-
Notifications
You must be signed in to change notification settings - Fork 2
/
word2vec.go
111 lines (101 loc) · 2.44 KB
/
word2vec.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/*
Package word2vec is a Go implementation of the original word2vec model.
Currently it only supports loading an existing model and querying the loaded
word vectors.
*/
package word2vec
import (
"bufio"
"encoding/binary"
"fmt"
"os"
)
// Model is the Word2vec model.
//
// It holds the word vectors read by Load in one flat slice, together with the
// index that maps each word to its position in that slice.
type Model struct {
// Layer1Size is the number of float32 components in each word vector.
// Load takes it from the second integer of the file header.
Layer1Size int
// Vocab maps a word to its index i; the word's vector is Vector(i).
Vocab map[string]int
// data stores all vectors back to back: the vector of word i occupies
// data[Layer1Size*i : Layer1Size*(i+1)].
data []float32
}
// Pair couples a word with a similarity score. MostSimilar returns its
// results as a slice of Pair values.
type Pair struct {
// Word is the vocabulary entry.
Word string
// Sim is the dot product between the query vector and the vector of Word.
Sim float32
}
// Load reads the model generated by the original word2vec from filename.
//
// The file is expected to start with a text header line holding two integers,
// the vocabulary size and the vector size. It is followed by one record per
// word: the word itself, a single space, the vector as little-endian float32
// values, and one trailing separator byte. Every vector is normalized as it
// is read.
//
// Load returns an error if the file cannot be opened, if the header cannot be
// parsed or declares a negative size, or if a record is truncated.
func Load(filename string) (*Model, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	// The file is only read, so an error from Close carries nothing actionable.
	defer file.Close()

	reader := bufio.NewReader(file)

	var vocabSize, layer1Size int
	if _, err := fmt.Fscanln(reader, &vocabSize, &layer1Size); err != nil {
		return nil, fmt.Errorf("reading word2vec header of %s: %w", filename, err)
	}
	// Negative sizes would make the allocations below panic.
	if vocabSize < 0 || layer1Size < 0 {
		return nil, fmt.Errorf("invalid word2vec header in %s: vocabulary size %d, vector size %d",
			filename, vocabSize, layer1Size)
	}

	model := &Model{
		Layer1Size: layer1Size,
		Vocab:      make(map[string]int),
		data:       make([]float32, layer1Size*vocabSize),
	}

	for i := 0; i < vocabSize; i++ {
		// The word runs up to, and excluding, the next space.
		token, err := reader.ReadBytes(' ')
		if err != nil {
			return nil, err
		}
		word := string(token[:len(token)-1])

		// Decode straight into the model's storage for word i.
		vector := model.Vector(i)
		if err := binary.Read(reader, binary.LittleEndian, vector); err != nil {
			return nil, err
		}
		vector.Normalize()

		// Skip the single byte that follows each vector (presumably the
		// newline written by the original tool - TODO confirm). The error is
		// deliberately ignored so that a file without the final separator
		// still loads.
		_, _ = reader.ReadByte()

		model.Vocab[word] = i
	}
	return model, nil
}
// Vector returns the vector of the i-th word.
//
// The result is a slice of the model's own storage, not a copy, so writes
// through it modify the model.
func (m *Model) Vector(i int) Vector {
	start := i * m.Layer1Size
	end := start + m.Layer1Size
	return Vector(m.data[start:end])
}
// Similarity returns the similarity of the words x and y, computed as the dot
// product of their stored vectors. Load normalizes every vector it reads, so
// for a loaded model this is presumably the cosine similarity (depends on
// Vector.Normalize, which is defined elsewhere).
//
// An error is returned if either word is missing from the vocabulary; x is
// checked before y.
func (m *Model) Similarity(x, y string) (float32, error) {
	var ids [2]int
	for k, w := range [2]string{x, y} {
		id, known := m.Vocab[w]
		if !known {
			return 0, fmt.Errorf("Word not found: %s", w)
		}
		ids[k] = id
	}
	left, right := m.Vector(ids[0]), m.Vector(ids[1])
	return left.Dot(right), nil
}
// MostSimilar returns the n words most similar to
// sum(positives) - sum(negatives), ordered from most to least similar.
//
// The similarity of a word is the dot product between its vector and the
// normalized target vector. The query words themselves are not excluded from
// the result. Words with equal similarity appear in no particular order,
// because the vocabulary is a map.
//
// At most n pairs are returned: fewer if the vocabulary holds fewer than n
// words, and none if n <= 0. An error is returned if any word in positives or
// negatives is missing from the vocabulary.
func (m *Model) MostSimilar(positives, negatives []string, n int) ([]Pair, error) {
	// Construct the target vector.
	vec := Vector(make([]float32, m.Layer1Size))
	for _, word := range positives {
		wordID, ok := m.Vocab[word]
		if !ok {
			return nil, fmt.Errorf("Word not found: %s", word)
		}
		vec.Add(1, m.Vector(wordID))
	}
	for _, word := range negatives {
		wordID, ok := m.Vocab[word]
		if !ok {
			return nil, fmt.Errorf("Word not found: %s", word)
		}
		vec.Add(-1, m.Vector(wordID))
	}
	vec.Normalize()

	// Never ask for more results than there are words; this also bounds the
	// allocation below.
	if n > len(m.Vocab) {
		n = len(m.Vocab)
	}
	if n <= 0 {
		return []Pair{}, nil
	}

	// Keep the best candidates seen so far in descending order. The slice
	// starts empty rather than holding zero-valued placeholders, so words
	// with a similarity <= 0 can still be ranked.
	top := make([]Pair, 0, n)
	for w, i := range m.Vocab {
		cand := Pair{Word: w, Sim: vec.Dot(m.Vector(i))}
		switch {
		case len(top) < n:
			top = append(top, cand)
		case cand.Sim > top[n-1].Sim:
			top[n-1] = cand // evict the current worst
		default:
			continue
		}
		// Move the new entry up to its sorted position.
		for j := len(top) - 1; j > 0 && top[j].Sim > top[j-1].Sim; j-- {
			top[j], top[j-1] = top[j-1], top[j]
		}
	}
	return top, nil
}