Counting words continues (Mirek Fidler.. continues)
The goal was to write the fastest program that counts the frequency of
each word in a 40 MB text file..
(See: http://razi2.blogspot.com/2008/04/why-is-c-slower-than-java.html
)
The winner still is the UPP version by Mirek Fidler.
However, thanks to some insight from Brettschneider, I updated the
Java version that comes pretty close
For 40 mb text file
4700 ms (version 1)
3200 ms (version 2)
2200 ms (version 3)
The new version
1200 ms (java -server)
Only 400 ms slower than UPP.
------- new version ----
http://pastebin.com/ff5b4ed4
//counts the words in a text file...
//Razii
import java.io.*;
import java.util.*;
import java.nio.*;
import java.nio.channels.*;
public final class Jwc2
{
private static final Dictionary dictionary = new Dictionary();
private static int tWords = 0;
private static int tLines = 0;
private static int tBytes = 0;
private final static int charToIndex(char c)
{
if (c <= 'Z') return c - 'A' + 26;
return c - 'a';
}
public static void main(final String[] args) throws Exception
{
System.out.println("Lines\tWords\tBytes\tFile\n");
//TIME STARTS HERE
final long start = System.currentTimeMillis();
for (String arg : args)
{
File file = new File(arg);
if (!file.isFile())
{
continue;
}
int numLines = 0;
int numWords = 0;
final int numBytes = (int) file.length();
ByteBuffer in = new FileInputStream(arg).getChannel().map
(FileChannel.MapMode.READ_ONLY, 0, numBytes);
in.rewind();
Dictionary d = dictionary;
for (int i = 0;i < numBytes;i++)
{
char c = (char)in.get();
if (c == '\n') numLines++;
else if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')
{
int index = charToIndex(c);
if (d.myLinks[index] == null)
{
d.myLinks[index] = new Dictionary();
}
d = d.myLinks[index];
}
else if (d != dictionary)
{
numWords++;
d.count++;
d = dictionary;
}
}
System.out.println( numLines + "\t" + numWords + "\t" + numBytes +
"\t" + arg);
tLines += numLines;
tWords += numWords;
tBytes += numBytes;
}
//Stop time...
final long end = System.currentTimeMillis();
System.out.println("---------------------------------------");
if (args.length > 1)
{
System.out.println(tLines + "\t" + tWords + "\t" + tBytes +
"\tTotal");
System.out.println("---------------------------------------");
}
//prints words
dictionary.print("");
System.out.println("Time: " + (end - start) + " ms");
}///END OF MAIN
//private class
private final static class Dictionary
{
private static final int ALPH_SIZE = 52;
private Dictionary[] myLinks = new Dictionary[ALPH_SIZE];
private int count = 0;
public final void print(String s)
{
for(char c = 'a';c <= 'z';++c)
{
int i = charToIndex(c);
Dictionary t = myLinks[i];
if (t != null)
{
if(t.count > 0) System.out.println(t.count + "\t" + s + c);
t.print(s + c);
}
}
for(char c = 'A';c <= 'Z';++c)
{
int i = charToIndex(c);
Dictionary t = myLinks[i];
if (t != null)
{
if(t.count > 0) System.out.println(t.count + "\t" + s + c);
t.print(s + c);
}
}
}
}
}