Re: Fastest! Counting words (Mirek Fidler.. continues)
On Mon, 07 Apr 2008 08:31:22 -0500, Razii
<DONTwhatevere3e@hotmail.com> wrote:
----the new version --
If you can't read this or lines are messed up, try
http://www.pastebin.ca/975248
Well, ignore the last one. This one is even slightly faster (0.2 sec
faster for a 400 MB file).
http://www.pastebin.ca/975396
---
//pm_kirkham
import java.io.*;
/**
 * Counts the lines, words and bytes of the files named on the command line
 * and records how often each distinct word occurs.
 *
 * <p>A word is a maximal run of ASCII letters ({@code a-z}, {@code A-Z});
 * every other byte is a separator. Words are stored in a trie that is laid
 * out flat in {@link #dictionaryData}: each node is {@code NODE_SIZE}
 * consecutive slots, slot {@code letter - 'A'} holds the offset of the child
 * node for that letter (0 means "no child"), and slot {@code COUNT_SLOT}
 * holds the number of times the word ending at that node was seen. The node
 * at offset 0 is the root, so a node offset of 0 also means "not inside a
 * word".
 */
public final class Wc_pmk {

    /** Number of array slots per trie node; {@code 'z' - 'A'} is 57, so 64 covers every letter. */
    private static final int NODE_SIZE = 64;

    /** Slot holding a node's occurrence count; 26 is {@code '[' - 'A'}, which no letter maps to. */
    private static final int COUNT_SLOT = 26;

    /** Initial number of slots in the trie array used by the no-argument constructor. */
    private static final int DEFAULT_CAPACITY = 4096 * 3072;

    /** Largest trie array this class will try to allocate. */
    private static final int MAX_CAPACITY = Integer.MAX_VALUE - 8;

    /** Size in bytes of the read buffer used by {@link #processFile(String)}. */
    private static final int CHUNK_SIZE = 4096;

    /** Size of the word buffer handed to {@link #dumpDictionary(int, char[], int)}. */
    private static final int DUMP_BUFFER_SIZE = 1024;

    // Totals are long so that inputs of 2 GiB and more do not wrap around.
    long totalWords = 0;
    long totalLines = 0;
    long totalBytes = 0;

    /** Number of trie nodes allocated so far, not counting the root. */
    int dictionaryCount = 0;

    /** Flat trie storage; replaced by a larger array when it fills up. */
    int[] dictionaryData;

    // Not read or written anywhere in this class; retained so that existing
    // references to the field keep compiling.
    int dictionaries = 0;

    /** Creates a counter whose trie starts with the default capacity. */
    public Wc_pmk() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * Creates a counter whose trie starts with the given capacity.
     *
     * @param initialCapacity initial number of int slots; must hold at least the root node
     * @throws IllegalArgumentException if {@code initialCapacity} is smaller than one node
     */
    Wc_pmk(int initialCapacity) {
        if (initialCapacity < NODE_SIZE) {
            throw new IllegalArgumentException(
                    "initialCapacity must be at least " + NODE_SIZE + ", got " + initialCapacity);
        }
        dictionaryData = new int[initialCapacity];
    }

    /**
     * Processes every file named in {@code args}, prints the totals (and the
     * per-word counts when at least one argument was given) and finally the
     * elapsed processing time. The time is taken before the results are
     * printed, so printing is not part of the measurement.
     *
     * @param args paths of the files to count
     * @throws Exception if reading a file fails
     */
    public static void main(final String[] args) throws Exception {
        final long starttime = System.currentTimeMillis();
        final Wc_pmk worker = new Wc_pmk();
        for (final String arg : args) {
            worker.processFile(arg);
        }
        final long stoptime = System.currentTimeMillis();
        worker.printResults(args.length > 0);
        System.out.println("pmk time " + (stoptime - starttime) + "ms");
    } //end of main

    /**
     * Reads one file to its end and adds its lines, words and bytes to the
     * totals. Paths that do not name a regular file are skipped silently.
     *
     * @param arg path of the file to read
     * @throws Exception if opening or reading the file fails
     */
    void processFile(String arg) throws Exception {
        final File file = new File(arg);
        if (!file.isFile()) {
            return;
        }
        final FileInputStream in = new FileInputStream(file);
        try {
            final byte[] buf = new byte[CHUNK_SIZE];
            // Offset of the trie node for the word in progress; carried from
            // chunk to chunk so a word split across two reads stays one word.
            int dindex = 0;
            int len;
            // read() may return fewer bytes than requested without being at
            // end of file, so loop until it reports -1 and count what it
            // actually delivered.
            while ((len = in.read(buf, 0, buf.length)) != -1) {
                dindex = processChunk(buf, len, dindex);
                totalBytes += len;
            }
            // A word that runs up to end of file has no trailing separator
            // to trigger the count in processChunk; count it here.
            if (dindex != 0) {
                totalWords++;
                dictionaryData[dindex + COUNT_SLOT]++;
            }
        } finally {
            in.close();
        }
    }

    /**
     * Prints the line, word and byte totals, optionally followed by the count
     * of every distinct word, and then the number of trie nodes allocated.
     *
     * @param dump whether to print the per-word counts
     */
    void printResults(boolean dump) {
        System.out.println("Lines\tWords\tBytes");
        System.out.println("---------------------------------------");
        System.out.println(totalLines + "\t" + totalWords + "\t" +
                totalBytes + "\tTotal");
        System.out.println("---------------------------------------");
        if (dump) {
            dumpDictionary(0, new char[DUMP_BUFFER_SIZE], 0);
        }
        System.out.println("dictionaryCount: " + dictionaryCount);
    }

    /**
     * Scans the first {@code len} bytes of {@code buf}, updating the line and
     * word totals and the trie. A word is counted when the separator after it
     * is seen, so a word still open at the end of the chunk is not counted
     * here; its node offset is returned instead.
     *
     * @param buf    bytes to scan
     * @param len    number of valid bytes in {@code buf}; zero or negative scans nothing
     * @param dindex node offset of the word in progress from the previous
     *               chunk, or 0 if the previous chunk ended outside a word
     * @return node offset of the word in progress at the end of this chunk,
     *         or 0 if the chunk ended outside a word
     * @throws IllegalStateException if the trie would need more slots than this class allocates
     */
    int processChunk(byte[] buf, int len, int dindex) {
        int numLines = 0;
        int numWords = 0;
        // Work on locals inside the loop and write the fields back once.
        int[] data = this.dictionaryData;
        int nodeCount = this.dictionaryCount;
        for (int j = 0; j < len; ++j) {
            final int c = buf[j];
            if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') {
                final int index = (c - 'A') + dindex;
                dindex = data[index];
                if (dindex == 0) {
                    // No child for this letter yet: append a new node,
                    // enlarging the array first if the node would not fit.
                    final long end = (nodeCount + 2L) * NODE_SIZE;
                    if (end > data.length) {
                        data = grow(data, end);
                        this.dictionaryData = data;
                    }
                    nodeCount++;
                    dindex = nodeCount * NODE_SIZE;
                    data[index] = dindex;
                }
            } else {
                if (c == '\n') {
                    numLines++;
                }
                if (dindex != 0) {
                    // First separator after a word: count the word.
                    numWords++;
                    data[dindex + COUNT_SLOT]++;
                    dindex = 0;
                }
            }
        }
        totalLines += numLines;
        totalWords += numWords;
        this.dictionaryCount = nodeCount;
        return dindex;
    }

    /**
     * Prints the count and spelling of every word stored at or below the
     * given node, one per line as {@code count<TAB>word}. Children are
     * visited in slot order, so upper-case letters come before lower-case.
     *
     * @param dindex offset of the node to start from (0 for the whole trie)
     * @param buf    buffer whose first {@code buflen} chars spell the path to
     *               {@code dindex}; positions from {@code buflen} on are overwritten
     * @param buflen number of chars of {@code buf} that are in use
     */
    void dumpDictionary(int dindex, char[] buf, int buflen) {
        final int occurrences = dictionaryData[dindex + COUNT_SLOT];
        if (occurrences != 0) {
            System.out.println(occurrences + "\t" + new String(buf, 0, buflen));
        }
        for (int i = 0; i < NODE_SIZE; ++i) {
            final int child = dictionaryData[dindex + i];
            if (child != 0 && i != COUNT_SLOT) {
                if (buflen == buf.length) {
                    // Buffer is full: continue in a larger copy. The caller's
                    // array still holds its own prefix, so it is unaffected.
                    final char[] bigger = new char[Math.max(16, buf.length * 2)];
                    System.arraycopy(buf, 0, bigger, 0, buflen);
                    buf = bigger;
                }
                buf[buflen] = (char) ('A' + i);
                dumpDictionary(child, buf, buflen + 1);
            }
        }
    }

    /** Returns a copy of {@code data} with at least {@code required} slots, at least doubling its size when allowed. */
    private static int[] grow(int[] data, long required) {
        if (required > MAX_CAPACITY) {
            throw new IllegalStateException(
                    "dictionary needs " + required + " slots, more than the supported maximum of "
                            + MAX_CAPACITY);
        }
        final long doubled = Math.min(2L * data.length, (long) MAX_CAPACITY);
        final int[] bigger = new int[(int) Math.max(required, doubled)];
        System.arraycopy(data, 0, bigger, 0, data.length);
        return bigger;
    }
}