Commit 891d84c6 authored by tmikolov

update to 0.1c version

parent 5815e5d0
@@ -3,9 +3,9 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
fi
echo -----------------------------------------------------------------------------------------------------
echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
echo ---------------------------------------------------------------------------------------------------
echo Note that for the word analogy to perform well, the model should be trained on much larger data set
echo Example input: paris france berlin
echo -----------------------------------------------------------------------------------------------------
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
echo ---------------------------------------------------------------------------------------------------
time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./word-analogy vectors.bin
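As a side note, word-analogy reads queries from standard input, so once vectors.bin exists the demo can also be scripted instead of typed; a small sketch, assuming piped input is handled the same way as the interactive prompt and reusing the example query from the script above:
# non-interactive analogy query; "paris france berlin" should suggest words close to germany
printf 'paris france berlin\nEXIT\n' | ./word-analogy vectors.bin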
@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
fi
time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
sort classes.txt -k 2 -n > classes.sorted.txt
echo The word classes were saved to file classes.sorted.txt
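Each line of classes.sorted.txt should contain a vocabulary word followed by its class id, which is what the numeric sort on column 2 above relies on; a quick follow-up sketch (class id 50 is just an arbitrary example):
# list all words assigned to one class, e.g. class 50
awk '$2 == 50 { print $1 }' classes.sorted.txt
# count how many words ended up in each class (input is already sorted by class id)
awk '{ print $2 }' classes.sorted.txt | uniq -c | head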
make
if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
if [ ! -e news.2012.en.shuffled ]; then
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz -f
fi
echo ----------------------------------------------------------------------------------------------------------------
echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
echo To achieve better accuracy, larger training set is needed
echo ----------------------------------------------------------------------------------------------------------------
time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
./compute-accuracy vectors-phrase.bin <questions-phrases.txt
sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
./compute-accuracy vectors-phrase.bin < questions-phrases.txt
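The sed/tr pipeline above only normalizes curly apostrophes to ASCII and turns everything outside letters, apostrophes, underscores and whitespace into spaces; a quick way to see its effect on a single line (assuming GNU sed and tr, output shown approximately):
echo "That’s the EU’s 2012 news-crawl data." \
  | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" \
  | tr -c "A-Za-z'_ \n" " "
# prints roughly: That's the EU's      news crawl data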
make
if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
if [ ! -e news.2012.en.shuffled ]; then
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz -f
fi
time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
./distance vectors-phrase.bin
\ No newline at end of file
sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
./distance vectors-phrase.bin
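word2phrase joins frequent collocations with an underscore, so vectors-phrase.bin contains tokens such as new_york; ./distance can then be queried with them interactively or via a pipe (a sketch, assuming new_york survived the phrase thresholds used above):
# nearest neighbours of the phrase token new_york; EXIT ends the interactive loop
printf 'new_york\nEXIT\n' | ./distance vectors-phrase.bin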
@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
fi
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./compute-accuracy vectors.bin 30000 < questions-words.txt
# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
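The trailing 30000 restricts the evaluation to the 30,000 most frequent words, which speeds it up and skips rare-word questions. To compare the old hierarchical-softmax settings against the negative-sampling settings introduced in this commit, the two training commands can write to separate files (a sketch; the vectors-hs.bin / vectors-neg.bin names are only illustrative):
time ./word2vec -train text8 -output vectors-hs.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
time ./word2vec -train text8 -output vectors-neg.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./compute-accuracy vectors-hs.bin 30000 < questions-words.txt
./compute-accuracy vectors-neg.bin 30000 < questions-words.txt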
@@ -3,5 +3,5 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
fi
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
./distance vectors.bin
\ No newline at end of file
time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./distance vectors.bin
CC = gcc
#The -Ofast might not work with older versions of gcc; in that case, use -O2
CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result
#Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
all: word2vec word2phrase distance word-analogy compute-accuracy
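If the installed gcc rejects -Ofast or -march=native, the flags can be overridden from the command line instead of editing the makefile, since CFLAGS is assigned with a plain '='; a conservative fallback, for example:
make CFLAGS="-lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result"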