Commit 891d84c6 authored by tmikolov

update to 0.1c version

parent 5815e5d0
...@@ -3,9 +3,9 @@ if [ ! -e text8 ]; then ...@@ -3,9 +3,9 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f gzip -d text8.gz -f
fi fi
echo ----------------------------------------------------------------------------------------------------- echo ---------------------------------------------------------------------------------------------------
echo Note that for the word analogy to perform well, the models should be trained on much larger data sets echo Note that for the word analogy to perform well, the model should be trained on much larger data set
echo Example input: paris france berlin echo Example input: paris france berlin
echo ----------------------------------------------------------------------------------------------------- echo ---------------------------------------------------------------------------------------------------
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./word-analogy vectors.bin ./word-analogy vectors.bin
...@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then ...@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f gzip -d text8.gz -f
fi fi
time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500 time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
sort classes.txt -k 2 -n > classes.sorted.txt sort classes.txt -k 2 -n > classes.sorted.txt
echo The word classes were saved to file classes.sorted.txt echo The word classes were saved to file classes.sorted.txt
make make
if [ ! -e text8 ]; then if [ ! -e news.2012.en.shuffled ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
gzip -d text8.gz -f gzip -d news.2012.en.shuffled.gz -f
fi fi
echo ---------------------------------------------------------------------------------------------------------------- sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
echo To achieve better accuracy, larger training set is needed time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
echo ---------------------------------------------------------------------------------------------------------------- tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3 time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3 ./compute-accuracy vectors-phrase.bin < questions-phrases.txt
./compute-accuracy vectors-phrase.bin <questions-phrases.txt
make make
if [ ! -e text8 ]; then if [ ! -e news.2012.en.shuffled ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
gzip -d text8.gz -f gzip -d news.2012.en.shuffled.gz -f
fi fi
time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
./distance vectors-phrase.bin time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
\ No newline at end of file tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
./distance vectors-phrase.bin
...@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then ...@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f gzip -d text8.gz -f
fi fi
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./compute-accuracy vectors.bin 30000 < questions-words.txt ./compute-accuracy vectors.bin 30000 < questions-words.txt
# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
...@@ -3,5 +3,5 @@ if [ ! -e text8 ]; then ...@@ -3,5 +3,5 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f gzip -d text8.gz -f
fi fi
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./distance vectors.bin ./distance vectors.bin
\ No newline at end of file
CC = gcc CC = gcc
#The -Ofast might not work with older versions of gcc; in that case, use -O2 #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
all: word2vec word2phrase distance word-analogy compute-accuracy all: word2vec word2phrase distance word-analogy compute-accuracy
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment