Counting words in text file (Mirek Fidler -- : was Java - c++, IO)
On Wed, 26 Mar 2008 00:45:32 -0700 (PDT), Mirek Fidler
<cxl@ntllib.org> wrote:
Anyway, Razii, there is a nice benchmark at the end of this page:
http://www.digitalmars.com/d/2.0/cppstrings.html
Maybe you could create and benchmark an effective Java implementation. I
would be glad to add such a version to the comparison here:
First, time starts in main(). It's a short text file, and if you
include VM load time, the test is invalid. The VM load time would be
longer than the task itself. If not, use the JET compiler so there is no VM
load time.
Also, how are you calling your page "Strings" when this supposed
benchmark spends 90% of the time in I/O reading and writing data to
disk? To reduce I/O factor, time counting ends before the output is
printed. I modified the c++ version so it has internal time counter.
The time will be printed at the end of log.txt file.
Here are the text files: "Alice in Wonderland" (160 KB)
http://www.gutenberg.org/dirs/etext91/alice30.txt
bible.txt (3 meg)
http://www.cas.mcmaster.ca/~bill/strings/english/bible
And I made bible.txt into 40 meg, that is bible2.txt
c++ version compiled with VC++
cl /O2 /GL wc1.cpp /link /ltcg
C:\>java -server WordCount alice30.txt>log.txt
Time: 266 ms
C:\>java WordCount alice30.txt>log.txt
Time: 78 ms
C:\>wc1 alice30.txt>log2.txt
Time: 31 ms
For a short running program, java -server was much slower than java
client (due to load factor?). c++ version is 2 times faster than Java
client.
C:\>java -server WordCount bible.txt>log.txt
Time: 781 ms
C:\>java WordCount bible.txt>log.txt
Time: 625 ms
C:\>wc1 bible.txt>log2.txt
Time: 578 ms
Time differences between java and c++ reduced with larger txt file,
bible.txt
C:\>java -server WordCount bible2.txt>log.txt
Time: 5297 ms
C:\>java WordCount bible2.txt>log.txt
Time: 5421 ms
C:\>wc1 bible2.txt>log2.txt
Time: 5750 ms
C++ loses to both java client and server with 40 meg bible2.txt.
C:\>java -server WordCount alice30.txt bible.txt bible2.txt>log.txt
Time: 5687 ms
C:\>java WordCount alice30.txt bible.txt bible2.txt>log.txt
Time: 6218 ms
C:\>wc1 alice30.txt bible.txt bible2.txt>log2.txt
Time: 6531 ms
When all three files are included together on the command line, C++ is one sec
slower than java -server!
Both the Java and C++ versions are below.
== JAVA ==
Also posted at the link below, in case you can't read it here:
http://pastebin.com/f827de83
//counts the words in a text file...
import java.io.*;
import java.util.*;
/**
 * Counts lines, words and bytes for every file named on the command line,
 * then prints how often each distinct word occurred (sorted by word) and the
 * elapsed time of the counting phase in milliseconds.
 */
public class WordCount {
    /** Occurrence count per word, accumulated across all input files. */
    static Map<String, Integer> dictionary =
        new HashMap<String, Integer>(14000);
    /** Running totals over all processed files. */
    static int tWords = 0;
    static int tLines = 0;
    static long tBytes = 0;

    /**
     * Program entry point.
     *
     * @param args paths of the files to count; entries that are not regular
     *             files are skipped
     * @throws Exception if a file cannot be opened or read
     */
    public static void main(String[] args)
            throws Exception {
        System.out.println("Lines\tWords\tBytes\tFile\n");
        //TIME STARTS HERE
        long start = System.currentTimeMillis();
        for (int i = 0; i < args.length; i++) {
            File file = new File(args[i]);
            if (!file.isFile()) {
                continue;
            }
            int numLines = 0;
            int numWords = 0;
            long numBytes = file.length();
            BufferedReader input = new BufferedReader(
                new InputStreamReader(new FileInputStream(file),
                                      "ISO-8859-1"));
            try {
                StreamTokenizer st = new StreamTokenizer(input);
                // Turn these into plain one-character tokens so they act as
                // word separators instead of the tokenizer's default comment,
                // quote and number characters.
                st.ordinaryChar('/');
                st.ordinaryChar('.');
                st.ordinaryChar('-');
                st.ordinaryChar('"');
                st.ordinaryChar('\'');
                // Report line ends as TT_EOL tokens so lines can be counted.
                st.eolIsSignificant(true);
                while (st.nextToken() != StreamTokenizer.TT_EOF) {
                    if (st.ttype == StreamTokenizer.TT_EOL) {
                        numLines++;
                    } else if (st.ttype == StreamTokenizer.TT_WORD) {
                        numWords++;
                        // One get() instead of containsKey() followed by get().
                        Integer count = dictionary.get(st.sval);
                        dictionary.put(st.sval, count == null ? 1 : count + 1);
                    }
                }
            } finally {
                // Release the file handle even if tokenizing fails.
                input.close();
            }
            System.out.println(
                numLines + "\t" + numWords + "\t" + numBytes + "\t" +
                args[i]);
            tLines += numLines;
            tWords += numWords;
            tBytes += numBytes;
        }
        // Only converting it to a TreeMap so the results appear ordered;
        // this part could have been moved down to the printing phase
        // (i.e. not included in the time).
        Map<String, Integer> tp = new TreeMap<String, Integer>(dictionary);
        //TIME ENDS HERE
        long end = System.currentTimeMillis();
        System.out.println("---------------------------------------");
        if (args.length > 1) {
            System.out.println(
                tLines + "\t" + tWords + "\t" + tBytes + "\tTotal");
            System.out.println("---------------------------------------");
        }
        for (Map.Entry<String, Integer> entry : tp.entrySet()) {
            System.out.println(entry.getValue() + "\t" + entry.getKey());
        }
        System.out.println("Time: " + (end - start) + " ms");
    }
}
==C++===
If it doesn't work, try
http://pastebin.com/f6d921545
//Added time...originally by
//Newsgroups: comp.lang.c++.moderated
//From: "Vadim Ferderer" <sp...@ferderer.de>
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <cstdio>
#include <map>
#include <ctime>
// Scans `text` once, adding every word found to `dictionary` and adding the
// number of '\n' characters to `lines` and the number of words to `words`.
// A word starts at an ASCII letter and ends at the first character that is
// neither an ASCII letter nor a digit; a digit never starts a word.
static void tally_words( const std::string& text,
                         std::map< std::string, int >& dictionary,
                         int& lines, int& words )
{
    bool inword = false;
    std::string::size_type wstart = 0;
    for ( std::string::size_type j = 0; j < text.length(); ++j )
    {
        const char c = text[j];
        if ( c == '\n' )
            ++lines;
        if ( c >= '0' && c <= '9' )
        {
            // Digits neither open nor close a word.
        }
        else if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) )
        {
            if ( !inword )
            {
                wstart = j;
                inword = true;
                ++words;
            }
        }
        else if ( inword )
        {
            // operator[] value-initialises a missing count to 0, so a single
            // lookup covers both the "new word" and "seen before" cases.
            ++dictionary[ text.substr( wstart, j - wstart ) ];
            inword = false;
        }
    }
    // The text ended in the middle of a word: count that last word too.
    if ( inword )
        ++dictionary[ text.substr( wstart ) ];
}

// Counts lines, words and characters for every file named on the command
// line, prints per-file and total counts, then the frequency of each distinct
// word (ordered by word) and the time spent counting, in milliseconds.
// Files that cannot be opened are reported on stderr and skipped.
int main( int argc, char* argv[] )
{
    int w_total = 0;
    int l_total = 0;
    int c_total = 0;
    std::map< std::string, int > dictionary;
    std::printf(" lines words bytes file\n" );
    //TIME STARTS HERE
    const std::clock_t start = std::clock();
    for ( int i = 1; i < argc; ++i )
    {
        std::ifstream input_file( argv[i] );
        if ( !input_file )
        {
            std::fprintf( stderr, "cannot open %s\n", argv[i] );
            continue;
        }
        // Read the whole file into memory in one go.
        std::ostringstream buffer;
        buffer << input_file.rdbuf();
        const std::string input( buffer.str() );

        int w_cnt = 0;
        int l_cnt = 0;
        tally_words( input, dictionary, l_cnt, w_cnt );
        const int c_cnt = static_cast< int >( input.length() );

        std::printf("%d\t%d\t%d\t %s\n", l_cnt, w_cnt, c_cnt, argv[i]);
        l_total += l_cnt;
        w_total += w_cnt;
        c_total += c_cnt;
    }
    //TIME ENDS HERE
    const std::clock_t end = std::clock();
    if ( argc > 2 )
    {
        std::printf("--------------------------------------\n%d\t%d\t%d\t total\n",
                    l_total, w_total, c_total);
    }
    std::printf("--------------------------------------\n");
    for ( std::map< std::string, int >::const_iterator cit = dictionary.begin();
          cit != dictionary.end(); ++cit )
    {
        std::printf( "%d %s\n", cit->second, cit->first.c_str() );
    }
    // Convert to double before dividing so sub-second times are not truncated.
    std::cout << "Time: "
              << double( end - start ) / CLOCKS_PER_SEC * 1000 << " ms\n";
    return 0;
}