Re: Counting words in text file (Mirek Fidler -- : was Java - c++, IO)
On Sun, 30 Mar 2008 03:02:07 -0700 (PDT), Mirek Fidler
<cxl@ntllib.org> wrote:
Well, yes and no. I try to test every single benchmark I
encounter. If I find a deficiency, I try to fix it.
I wrote a third version that is faster than the second version :)
http://pastebin.com/f691e5e86
For 3 meg file
Time: 578 ms (First version)
Time: 422 ms (Second version)
Time: 360 ms (Third version)
Don't use -server with smaller files such as the 3 meg one; the client
JVM is faster.
Now with 40 meg file (using -server this time)
Time: 4922 ms (First version)
Time: 3422 ms (Second version)
Time: 2797 ms (Third version)
:) :) :) :) :)
VC++
Time: 531 ms (3 meg)
Time: 5296 ms (40)
Now that's slow ...
U++
Time: 78 ms (3 meg)
Time: 828 ms (40 meg)
Now that's really fast :)
Third version below
Also, posted here http://pastebin.com/f691e5e86
-----------------------
//counts the words in a text file...
//combined effort: wlfshmn from #java on IRC Undernet
//and RAZII
import java.io.*;
import java.util.*;
import java.nio.*;
import java.nio.channels.*;
/**
 * Counts lines, words and bytes in each file named on the command line and
 * prints a per-word frequency table covering all files.
 *
 * A "word" is a maximal run of ASCII letters (a-z, A-Z); every other byte,
 * including the newline, acts as a separator. Words are case-sensitive.
 * A "line" is counted for every '\n' byte.
 */
public final class WordCount3
{
    // Word -> single-element counter array, so a hit can be incremented in
    // place without boxing or a second map write.
    private static final Map<String, int[]> dictionary =
        new HashMap<String, int[]>(800000);
    // Totals are long so that many large inputs cannot overflow them.
    private static long tWords = 0;
    private static long tLines = 0;
    private static long tBytes = 0;

    /**
     * Processes every argument that names a regular file; other arguments are
     * silently skipped.
     *
     * @param args paths of the files to count
     * @throws Exception if a file cannot be opened or mapped
     */
    public static void main(final String[] args) throws Exception
    {
        System.out.println("Lines\tWords\tBytes\tFile\n");
        //TIME STARTS HERE
        final long start = System.currentTimeMillis();
        for (String arg : args)
        {
            final File file = new File(arg);
            if (!file.isFile())
            {
                continue;
            }
            final long numBytes = file.length();
            final ByteBuffer in = mapReadOnly(arg, numBytes);
            final StringBuilder sb = new StringBuilder();
            int numLines = 0;
            int numWords = 0;
            while (in.hasRemaining())
            {
                final char c = (char) in.get();
                if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')
                {
                    sb.append(c);
                }
                else
                {
                    // Any non-letter ends the pending word. This must also
                    // happen for '\n', otherwise the last word of a line is
                    // merged with the first word of the next line.
                    if (sb.length() > 0)
                    {
                        numWords++;
                        recordWord(sb);
                    }
                    if (c == '\n')
                    {
                        numLines++;
                    }
                }
            }
            // The file may end in the middle of a word (no trailing separator).
            if (sb.length() > 0)
            {
                numWords++;
                recordWord(sb);
            }
            System.out.println(numLines + "\t" + numWords + "\t" + numBytes +
                "\t" + arg);
            tLines += numLines;
            tWords += numWords;
            tBytes += numBytes;
        }
        //only converting it to TreeMap so the results
        //appear ordered, I could have
        //moved this part down to printing phase
        //(i.e. not include it in time).
        final TreeMap<String, int[]> sort =
            new TreeMap<String, int[]>(dictionary);
        //TIME ENDS HERE
        final long end = System.currentTimeMillis();
        System.out.println("---------------------------------------");
        if (args.length > 1)
        {
            System.out.println(tLines + "\t" + tWords + "\t" + tBytes +
                "\tTotal");
            System.out.println("---------------------------------------");
        }
        for (Map.Entry<String, int[]> pairs : sort.entrySet())
        {
            System.out.println(pairs.getValue()[0] + "\t" + pairs.getKey());
        }
        System.out.println("Time: " + (end - start) + " ms");
    }

    /**
     * Maps the first {@code size} bytes of the file read-only and closes the
     * stream before returning, so no file handle is leaked per input file.
     * Per the FileChannel.map contract, the mapping stays valid after the
     * channel that created it is closed.
     */
    private static ByteBuffer mapReadOnly(final String path, final long size)
        throws IOException
    {
        final FileInputStream stream = new FileInputStream(path);
        try
        {
            return stream.getChannel().map(
                FileChannel.MapMode.READ_ONLY, 0, size);
        }
        finally
        {
            stream.close();
        }
    }

    /**
     * Adds one occurrence of the word held in {@code word} to the dictionary
     * and empties the builder so it can collect the next word.
     */
    private static void recordWord(final StringBuilder word)
    {
        // Build the key once; it is used for both the lookup and the insert.
        final String key = word.toString();
        final int[] count = dictionary.get(key);
        if (count != null)
        {
            count[0]++;
        }
        else
        {
            dictionary.put(key, new int[]{1});
        }
        word.setLength(0);
    }
}