Re: Fastest! Counting words  (Mirek Fidler.. continues)
 
On Mon, 07 Apr 2008 08:31:22 -0500, Razii
<DONTwhatevere3e@hotmail.com> wrote:
----the new version --
If you can't read this or lines are messed up, try 
http://www.pastebin.ca/975248
Well, ignore the last one. This one is even slightly faster (0.2 sec
faster for a 400 MB file).
http://www.pastebin.ca/975396
---
//pm_kirkham 
import java.io.*;
 
public final class Wc_pmk {
        
  public static void main(final String[] args) throws Exception {
        
    final long  starttime = System.currentTimeMillis();
    
    Wc_pmk worker = new Wc_pmk(); 
    
    for (String arg : args) 
    { 
        worker.processFile(arg);
    
    }
    
    final long  stoptime = System.currentTimeMillis();
    
    worker.printResults(args.length > 0);
    
    System.out.println("pmk time " + (stoptime - starttime) + "ms");
  } //end of main
 
  int   totalWords = 0;
  int   totalLines = 0;
  int   totalBytes = 0;
  int   dictionaryCount = 0;
  
  // will fail with files with too many distinct words
  // just increase the index size in that case
  int[] dictionaryData = new int[4096 * 3072];
  int   dictionaries = 0;
  
  void processFile (String arg) throws Exception { 
    File file = new File(arg);
   
    if (!file.isFile())  return;
 
    final int numBytes = (int) file.length();
 
    FileInputStream in = new FileInputStream(arg);
 
    // index of start of current dictionary
    int dindex = 0;
    
    // buffered read:
    final byte[] buf = new byte[4096];
 
    for (int bytesLeft = numBytes; bytesLeft > 0; bytesLeft-=4096) 
      dindex = processChunk(buf, in.read(buf, 0, 4096), dindex);
     
    totalBytes += numBytes;
  }
  
  void printResults (boolean dump) {
    System.out.println("Lines\tWords\tBytes");
    System.out.println("---------------------------------------");
    System.out.println(totalLines + "\t" + totalWords + "\t" +
totalBytes + "\tTotal");
    System.out.println("---------------------------------------");
  
    if (dump) 
      dumpDictionary(0, new char[1024], 0);
    
    System.out.println("dictionaryCount: " + dictionaryCount);
  }
  
  int processChunk (byte[] buf, int len, int dindex) {
    int numLines = 0;
    int numWords = 0;
    final int[] dictionaryData = this.dictionaryData;
    int dictionaryCount = this.dictionaryCount;
    
    for (int j = 0; j < len; ++j) { 
      int c = buf[j];
 
      if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') {
        
        final int index = ((c - 'A')) + dindex;
        
        dindex = dictionaryData[index];
        
        if (dindex == 0)
          dindex = dictionaryData[index] = (++dictionaryCount)*64;
      } else {
        if (c == '\n')
          numLines++;
        
        if (dindex != 0) {
          numWords++;
          dictionaryData[dindex + 26]++;
          dindex = 0;
        }
      }
    }
      
    totalLines += numLines;
    totalWords += numWords;
    
    this.dictionaryCount = dictionaryCount;
    return dindex;
  }
 
  void dumpDictionary (int dindex, char[] buf, int buflen) {
    if (dictionaryData[dindex + 26] != 0)
      System.out.println(dictionaryData[dindex + 26] + "\t" + new
String(buf, 0, buflen));
    
    for (int i = 0; i < 64; ++i) {
      if ((dictionaryData[dindex + i] != 0) && (i != 26)) {
        buf[buflen] = (char)('A' + (i));
        dumpDictionary(dictionaryData[dindex + i], buf, buflen + 1);
      }
    }
  }
}