Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
word2vec
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
35
Issues
35
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
DESHPANDE SRIJAY PARAG
word2vec
Commits
891d84c6
Commit
891d84c6
authored
Sep 06, 2014
by
tmikolov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update to 0.1c version
parent
5815e5d0
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
115 additions
and
93 deletions
+115
-93
demo-analogy.sh
demo-analogy.sh
+4
-4
demo-classes.sh
demo-classes.sh
+1
-1
demo-phrase-accuracy.sh
demo-phrase-accuracy.sh
+9
-10
demo-phrases.sh
demo-phrases.sh
+9
-6
demo-word-accuracy.sh
demo-word-accuracy.sh
+1
-1
demo-word.sh
demo-word.sh
+2
-2
makefile
makefile
+2
-2
word2vec.c
word2vec.c
+87
-67
No files found.
demo-analogy.sh
View file @
891d84c6
...
...
@@ -3,9 +3,9 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip
-O
text8.gz
gzip
-d
text8.gz
-f
fi
echo
---------------------------------------------------------------------------------------------------
--
echo
Note that
for
the word analogy to perform well, the model
s should be trained on much larger data sets
echo
---------------------------------------------------------------------------------------------------
echo
Note that
for
the word analogy to perform well, the model
should be trained on much larger data
set
echo
Example input: paris france berlin
echo
---------------------------------------------------------------------------------------------------
--
time
./word2vec
-train
text8
-output
vectors.bin
-cbow
0
-size
200
-window
5
-negative
0
-hs
1
-sample
1e-3
-threads
12
-binary
1
echo
---------------------------------------------------------------------------------------------------
time
./word2vec
-train
text8
-output
vectors.bin
-cbow
1
-size
200
-window
8
-negative
25
-hs
0
-sample
1e-4
-threads
20
-binary
1
-iter
15
./word-analogy vectors.bin
demo-classes.sh
View file @
891d84c6
...
...
@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip
-O
text8.gz
gzip
-d
text8.gz
-f
fi
time
./word2vec
-train
text8
-output
classes.txt
-cbow
0
-size
200
-window
5
-negative
0
-hs
1
-sample
1e-3
-threads
12
-classes
500
time
./word2vec
-train
text8
-output
classes.txt
-cbow
1
-size
200
-window
8
-negative
25
-hs
0
-sample
1e-4
-threads
20
-iter
15
-classes
500
sort
classes.txt
-k
2
-n
>
classes.sorted.txt
echo
The word classes were saved to file classes.sorted.txt
demo-phrase-accuracy.sh
View file @
891d84c6
make
if
[
!
-e
text8
]
;
then
wget http://
mattmahoney.net/dc/text8.zip
-O
text8
.gz
gzip
-d
text8
.gz
-f
if
[
!
-e
news.2012.en.shuffled
]
;
then
wget http://
www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled
.gz
gzip
-d
news.2012.en.shuffled
.gz
-f
fi
echo
----------------------------------------------------------------------------------------------------------------
echo
Note that the accuracy and coverage of the
test set
questions is going to be low with this small training corpus
echo
To achieve better accuracy, larger training
set
is needed
echo
----------------------------------------------------------------------------------------------------------------
time
./word2phrase
-train
text8
-output
text8-phrase
-threshold
500
-debug
2
-min-count
3
time
./word2vec
-train
text8-phrase
-output
vectors-phrase.bin
-cbow
0
-size
300
-window
10
-negative
0
-hs
1
-sample
1e-3
-threads
12
-binary
1
-min-count
3
./compute-accuracy vectors-phrase.bin <questions-phrases.txt
sed
-e
"s/’/'/g"
-e
"s/′/'/g"
-e
"s/''/ /g"
< news.2012.en.shuffled |
tr
-c
"A-Za-z'_
\n
"
" "
>
news.2012.en.shuffled-norm0
time
./word2phrase
-train
news.2012.en.shuffled-norm0
-output
news.2012.en.shuffled-norm0-phrase0
-threshold
200
-debug
2
time
./word2phrase
-train
news.2012.en.shuffled-norm0-phrase0
-output
news.2012.en.shuffled-norm0-phrase1
-threshold
100
-debug
2
tr
A-Z a-z < news.2012.en.shuffled-norm0-phrase1
>
news.2012.en.shuffled-norm1-phrase1
time
./word2vec
-train
news.2012.en.shuffled-norm1-phrase1
-output
vectors-phrase.bin
-cbow
1
-size
200
-window
10
-negative
25
-hs
0
-sample
1e-5
-threads
20
-binary
1
-iter
15
./compute-accuracy vectors-phrase.bin < questions-phrases.txt
demo-phrases.sh
View file @
891d84c6
make
if
[
!
-e
text8
]
;
then
wget http://
mattmahoney.net/dc/text8.zip
-O
text8
.gz
gzip
-d
text8
.gz
-f
if
[
!
-e
news.2012.en.shuffled
]
;
then
wget http://
www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled
.gz
gzip
-d
news.2012.en.shuffled
.gz
-f
fi
time
./word2phrase
-train
text8
-output
text8-phrase
-threshold
500
-debug
2
time
./word2vec
-train
text8-phrase
-output
vectors-phrase.bin
-cbow
0
-size
300
-window
10
-negative
0
-hs
1
-sample
1e-3
-threads
12
-binary
1
sed
-e
"s/’/'/g"
-e
"s/′/'/g"
-e
"s/''/ /g"
< news.2012.en.shuffled |
tr
-c
"A-Za-z'_
\n
"
" "
>
news.2012.en.shuffled-norm0
time
./word2phrase
-train
news.2012.en.shuffled-norm0
-output
news.2012.en.shuffled-norm0-phrase0
-threshold
200
-debug
2
time
./word2phrase
-train
news.2012.en.shuffled-norm0-phrase0
-output
news.2012.en.shuffled-norm0-phrase1
-threshold
100
-debug
2
tr
A-Z a-z < news.2012.en.shuffled-norm0-phrase1
>
news.2012.en.shuffled-norm1-phrase1
time
./word2vec
-train
news.2012.en.shuffled-norm1-phrase1
-output
vectors-phrase.bin
-cbow
1
-size
200
-window
10
-negative
25
-hs
0
-sample
1e-5
-threads
20
-binary
1
-iter
15
./distance vectors-phrase.bin
demo-word-accuracy.sh
View file @
891d84c6
...
...
@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip
-O
text8.gz
gzip
-d
text8.gz
-f
fi
time
./word2vec
-train
text8
-output
vectors.bin
-cbow
0
-size
200
-window
5
-negative
0
-hs
1
-sample
1e-3
-threads
12
-binary
1
time
./word2vec
-train
text8
-output
vectors.bin
-cbow
1
-size
200
-window
8
-negative
25
-hs
0
-sample
1e-4
-threads
20
-binary
1
-iter
15
./compute-accuracy vectors.bin 30000 < questions-words.txt
# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
demo-word.sh
View file @
891d84c6
...
...
@@ -3,5 +3,5 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip
-O
text8.gz
gzip
-d
text8.gz
-f
fi
time
./word2vec
-train
text8
-output
vectors.bin
-cbow
0
-size
200
-window
5
-negative
0
-hs
1
-sample
1e-3
-threads
12
-binary
1
time
./word2vec
-train
text8
-output
vectors.bin
-cbow
1
-size
200
-window
8
-negative
25
-hs
0
-sample
1e-4
-threads
20
-binary
1
-iter
15
./distance vectors.bin
makefile
View file @
891d84c6
CC
=
gcc
#
The -Ofast might not work with older versions of gcc; in that case, use -O2
CFLAGS
=
-lm
-pthread
-O
fast
-march
=
native
-Wall
-funroll-loops
-Wno-unused-result
#
Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
CFLAGS
=
-lm
-pthread
-O
3
-march
=
native
-Wall
-funroll-loops
-Wno-unused-result
all
:
word2vec word2phrase distance word-analogy compute-accuracy
...
...
word2vec.c
View file @
891d84c6
...
...
@@ -37,15 +37,15 @@ struct vocab_word {
char
train_file
[
MAX_STRING
],
output_file
[
MAX_STRING
];
char
save_vocab_file
[
MAX_STRING
],
read_vocab_file
[
MAX_STRING
];
struct
vocab_word
*
vocab
;
int
binary
=
0
,
cbow
=
0
,
debug_mode
=
2
,
window
=
5
,
min_count
=
5
,
num_threads
=
1
,
min_reduce
=
1
;
int
binary
=
0
,
cbow
=
1
,
debug_mode
=
2
,
window
=
5
,
min_count
=
5
,
num_threads
=
12
,
min_reduce
=
1
;
int
*
vocab_hash
;
long
long
vocab_max_size
=
1000
,
vocab_size
=
0
,
layer1_size
=
100
;
long
long
train_words
=
0
,
word_count_actual
=
0
,
file_size
=
0
,
classes
=
0
;
real
alpha
=
0
.
025
,
starting_alpha
,
sample
=
0
;
long
long
train_words
=
0
,
word_count_actual
=
0
,
iter
=
5
,
file_size
=
0
,
classes
=
0
;
real
alpha
=
0
.
025
,
starting_alpha
,
sample
=
1e-3
;
real
*
syn0
,
*
syn1
,
*
syn1neg
,
*
expTable
;
clock_t
start
;
int
hs
=
1
,
negative
=
0
;
int
hs
=
0
,
negative
=
5
;
const
int
table_size
=
1e8
;
int
*
table
;
...
...
@@ -337,29 +337,32 @@ void ReadVocab() {
void
InitNet
()
{
long
long
a
,
b
;
unsigned
long
long
next_random
=
1
;
a
=
posix_memalign
((
void
**
)
&
syn0
,
128
,
(
long
long
)
vocab_size
*
layer1_size
*
sizeof
(
real
));
if
(
syn0
==
NULL
)
{
printf
(
"Memory allocation failed
\n
"
);
exit
(
1
);}
if
(
hs
)
{
a
=
posix_memalign
((
void
**
)
&
syn1
,
128
,
(
long
long
)
vocab_size
*
layer1_size
*
sizeof
(
real
));
if
(
syn1
==
NULL
)
{
printf
(
"Memory allocation failed
\n
"
);
exit
(
1
);}
for
(
b
=
0
;
b
<
layer1_size
;
b
++
)
for
(
a
=
0
;
a
<
vocab_size
;
a
++
)
for
(
a
=
0
;
a
<
vocab_size
;
a
++
)
for
(
b
=
0
;
b
<
layer1_size
;
b
++
)
syn1
[
a
*
layer1_size
+
b
]
=
0
;
}
if
(
negative
>
0
)
{
a
=
posix_memalign
((
void
**
)
&
syn1neg
,
128
,
(
long
long
)
vocab_size
*
layer1_size
*
sizeof
(
real
));
if
(
syn1neg
==
NULL
)
{
printf
(
"Memory allocation failed
\n
"
);
exit
(
1
);}
for
(
b
=
0
;
b
<
layer1_size
;
b
++
)
for
(
a
=
0
;
a
<
vocab_size
;
a
++
)
for
(
a
=
0
;
a
<
vocab_size
;
a
++
)
for
(
b
=
0
;
b
<
layer1_size
;
b
++
)
syn1neg
[
a
*
layer1_size
+
b
]
=
0
;
}
for
(
b
=
0
;
b
<
layer1_size
;
b
++
)
for
(
a
=
0
;
a
<
vocab_size
;
a
++
)
syn0
[
a
*
layer1_size
+
b
]
=
(
rand
()
/
(
real
)
RAND_MAX
-
0
.
5
)
/
layer1_size
;
for
(
a
=
0
;
a
<
vocab_size
;
a
++
)
for
(
b
=
0
;
b
<
layer1_size
;
b
++
)
{
next_random
=
next_random
*
(
unsigned
long
long
)
25214903917
+
11
;
syn0
[
a
*
layer1_size
+
b
]
=
(((
next_random
&
0xFFFF
)
/
(
real
)
65536
)
-
0
.
5
)
/
layer1_size
;
}
CreateBinaryTree
();
}
void
*
TrainModelThread
(
void
*
id
)
{
long
long
a
,
b
,
d
,
word
,
last_word
,
sentence_length
=
0
,
sentence_position
=
0
;
long
long
a
,
b
,
d
,
cw
,
word
,
last_word
,
sentence_length
=
0
,
sentence_position
=
0
;
long
long
word_count
=
0
,
last_word_count
=
0
,
sen
[
MAX_SENTENCE_LENGTH
+
1
];
long
long
l1
,
l2
,
c
,
target
,
label
;
long
long
l1
,
l2
,
c
,
target
,
label
,
local_iter
=
iter
;
unsigned
long
long
next_random
=
(
long
long
)
id
;
real
f
,
g
;
clock_t
now
;
...
...
@@ -374,11 +377,11 @@ void *TrainModelThread(void *id) {
if
((
debug_mode
>
1
))
{
now
=
clock
();
printf
(
"%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk "
,
13
,
alpha
,
word_count_actual
/
(
real
)(
train_words
+
1
)
*
100
,
word_count_actual
/
(
real
)(
iter
*
train_words
+
1
)
*
100
,
word_count_actual
/
((
real
)(
now
-
start
+
1
)
/
(
real
)
CLOCKS_PER_SEC
*
1000
));
fflush
(
stdout
);
}
alpha
=
starting_alpha
*
(
1
-
word_count_actual
/
(
real
)(
train_words
+
1
));
alpha
=
starting_alpha
*
(
1
-
word_count_actual
/
(
real
)(
iter
*
train_words
+
1
));
if
(
alpha
<
starting_alpha
*
0
.
0001
)
alpha
=
starting_alpha
*
0
.
0001
;
}
if
(
sentence_length
==
0
)
{
...
...
@@ -400,8 +403,16 @@ void *TrainModelThread(void *id) {
}
sentence_position
=
0
;
}
if
(
feof
(
fi
))
break
;
if
(
word_count
>
train_words
/
num_threads
)
break
;
if
(
feof
(
fi
)
||
(
word_count
>
train_words
/
num_threads
))
{
word_count_actual
+=
word_count
-
last_word_count
;
local_iter
--
;
if
(
local_iter
==
0
)
break
;
word_count
=
0
;
last_word_count
=
0
;
sentence_length
=
0
;
fseek
(
fi
,
file_size
/
(
long
long
)
num_threads
*
(
long
long
)
id
,
SEEK_SET
);
continue
;
}
word
=
sen
[
sentence_position
];
if
(
word
==
-
1
)
continue
;
for
(
c
=
0
;
c
<
layer1_size
;
c
++
)
neu1
[
c
]
=
0
;
...
...
@@ -410,6 +421,7 @@ void *TrainModelThread(void *id) {
b
=
next_random
%
window
;
if
(
cbow
)
{
//train the cbow architecture
// in -> hidden
cw
=
0
;
for
(
a
=
b
;
a
<
window
*
2
+
1
-
b
;
a
++
)
if
(
a
!=
window
)
{
c
=
sentence_position
-
window
+
a
;
if
(
c
<
0
)
continue
;
...
...
@@ -417,7 +429,10 @@ void *TrainModelThread(void *id) {
last_word
=
sen
[
c
];
if
(
last_word
==
-
1
)
continue
;
for
(
c
=
0
;
c
<
layer1_size
;
c
++
)
neu1
[
c
]
+=
syn0
[
c
+
last_word
*
layer1_size
];
cw
++
;
}
if
(
cw
)
{
for
(
c
=
0
;
c
<
layer1_size
;
c
++
)
neu1
[
c
]
/=
cw
;
if
(
hs
)
for
(
d
=
0
;
d
<
vocab
[
word
].
codelen
;
d
++
)
{
f
=
0
;
l2
=
vocab
[
word
].
point
[
d
]
*
layer1_size
;
...
...
@@ -463,6 +478,7 @@ void *TrainModelThread(void *id) {
if
(
last_word
==
-
1
)
continue
;
for
(
c
=
0
;
c
<
layer1_size
;
c
++
)
syn0
[
c
+
last_word
*
layer1_size
]
+=
neu1e
[
c
];
}
}
}
else
{
//train skip-gram
for
(
a
=
b
;
a
<
window
*
2
+
1
-
b
;
a
++
)
if
(
a
!=
window
)
{
c
=
sentence_position
-
window
+
a
;
...
...
@@ -611,7 +627,7 @@ int ArgPos(char *str, int argc, char **argv) {
int
main
(
int
argc
,
char
**
argv
)
{
int
i
;
if
(
argc
==
1
)
{
printf
(
"WORD VECTOR estimation toolkit v 0.1
b
\n\n
"
);
printf
(
"WORD VECTOR estimation toolkit v 0.1
c
\n\n
"
);
printf
(
"Options:
\n
"
);
printf
(
"Parameters for training:
\n
"
);
printf
(
"
\t
-train <file>
\n
"
);
...
...
@@ -623,18 +639,20 @@ int main(int argc, char **argv) {
printf
(
"
\t
-window <int>
\n
"
);
printf
(
"
\t\t
Set max skip length between words; default is 5
\n
"
);
printf
(
"
\t
-sample <float>
\n
"
);
printf
(
"
\t\t
Set threshold for occurrence of words. Those that appear with higher frequency"
);
printf
(
"
in the training data will be randomly down-sampled; default is 0 (off), useful value is 1e-5
\n
"
);
printf
(
"
\t\t
Set threshold for occurrence of words. Those that appear with higher frequency
in the training data
\n
"
);
printf
(
"
\t\t
will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
\n
"
);
printf
(
"
\t
-hs <int>
\n
"
);
printf
(
"
\t\t
Use Hierarchical Softmax; default is
1 (0 =
not used)
\n
"
);
printf
(
"
\t\t
Use Hierarchical Softmax; default is
0 (
not used)
\n
"
);
printf
(
"
\t
-negative <int>
\n
"
);
printf
(
"
\t\t
Number of negative examples; default is
0, common values are 5
- 10 (0 = not used)
\n
"
);
printf
(
"
\t\t
Number of negative examples; default is
5, common values are 3
- 10 (0 = not used)
\n
"
);
printf
(
"
\t
-threads <int>
\n
"
);
printf
(
"
\t\t
Use <int> threads (default 1)
\n
"
);
printf
(
"
\t\t
Use <int> threads (default 12)
\n
"
);
printf
(
"
\t
-iter <int>
\n
"
);
printf
(
"
\t\t
Run more training iterations (default 5)
\n
"
);
printf
(
"
\t
-min-count <int>
\n
"
);
printf
(
"
\t\t
This will discard words that appear less than <int> times; default is 5
\n
"
);
printf
(
"
\t
-alpha <float>
\n
"
);
printf
(
"
\t\t
Set the starting learning rate; default is 0.025
\n
"
);
printf
(
"
\t\t
Set the starting learning rate; default is 0.025
for skip-gram and 0.05 for CBOW
\n
"
);
printf
(
"
\t
-classes <int>
\n
"
);
printf
(
"
\t\t
Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
\n
"
);
printf
(
"
\t
-debug <int>
\n
"
);
...
...
@@ -646,9 +664,9 @@ int main(int argc, char **argv) {
printf
(
"
\t
-read-vocab <file>
\n
"
);
printf
(
"
\t\t
The vocabulary will be read from <file>, not constructed from the training data
\n
"
);
printf
(
"
\t
-cbow <int>
\n
"
);
printf
(
"
\t\t
Use the continuous bag of words model; default is
0 (
skip-gram model)
\n
"
);
printf
(
"
\t\t
Use the continuous bag of words model; default is
1 (use 0 for
skip-gram model)
\n
"
);
printf
(
"
\n
Examples:
\n
"
);
printf
(
"./word2vec -train data.txt -output vec.txt -
debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1
\n\n
"
);
printf
(
"./word2vec -train data.txt -output vec.txt -
size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3
\n\n
"
);
return
0
;
}
output_file
[
0
]
=
0
;
...
...
@@ -661,6 +679,7 @@ int main(int argc, char **argv) {
if
((
i
=
ArgPos
((
char
*
)
"-debug"
,
argc
,
argv
))
>
0
)
debug_mode
=
atoi
(
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-binary"
,
argc
,
argv
))
>
0
)
binary
=
atoi
(
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-cbow"
,
argc
,
argv
))
>
0
)
cbow
=
atoi
(
argv
[
i
+
1
]);
if
(
cbow
)
alpha
=
0
.
05
;
if
((
i
=
ArgPos
((
char
*
)
"-alpha"
,
argc
,
argv
))
>
0
)
alpha
=
atof
(
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-output"
,
argc
,
argv
))
>
0
)
strcpy
(
output_file
,
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-window"
,
argc
,
argv
))
>
0
)
window
=
atoi
(
argv
[
i
+
1
]);
...
...
@@ -668,6 +687,7 @@ int main(int argc, char **argv) {
if
((
i
=
ArgPos
((
char
*
)
"-hs"
,
argc
,
argv
))
>
0
)
hs
=
atoi
(
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-negative"
,
argc
,
argv
))
>
0
)
negative
=
atoi
(
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-threads"
,
argc
,
argv
))
>
0
)
num_threads
=
atoi
(
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-iter"
,
argc
,
argv
))
>
0
)
iter
=
atoi
(
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-min-count"
,
argc
,
argv
))
>
0
)
min_count
=
atoi
(
argv
[
i
+
1
]);
if
((
i
=
ArgPos
((
char
*
)
"-classes"
,
argc
,
argv
))
>
0
)
classes
=
atoi
(
argv
[
i
+
1
]);
vocab
=
(
struct
vocab_word
*
)
calloc
(
vocab_max_size
,
sizeof
(
struct
vocab_word
));
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment