-
Notifications
You must be signed in to change notification settings - Fork 33
/
en_tokenizer.go
29 lines (24 loc) · 869 Bytes
/
en_tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
package shield
import (
"regexp"
"strings"
)
// enTokenizer is the English-language Tokenizer implementation. It has no
// fields, so every value behaves identically.
type enTokenizer struct{}
// NewEnglishTokenizer returns a Tokenizer backed by a freshly allocated
// enTokenizer.
func NewEnglishTokenizer() Tokenizer {
	return new(enTokenizer)
}
// Tokenize cuts text into fragments wherever splitTokenRx matches, discards
// fragments of two bytes or fewer, lower-cases the remaining ones and returns
// how many times each lower-cased fragment occurred. The returned map is never
// nil, even for empty input.
func (t *enTokenizer) Tokenize(text string) (words map[string]int64) {
	words = map[string]int64{}
	pieces := splitTokenRx.Split(text, -1)
	for i := range pieces {
		// Fragments shorter than three bytes are not counted.
		if len(pieces[i]) < 3 {
			continue
		}
		key := strings.ToLower(pieces[i])
		words[key] = words[key] + 1
	}
	return words
}
// splitTokenRx matches everything that separates tokens: runs of non-word
// characters, and whole words taken from the SpamAssassin Bayes stop list.
//
// http://wiki.apache.org/spamassassin/BayesStopList
//
// The stop words are wrapped in \b...\b so they only match as complete words;
// without the anchors "candy" would be cut on "and" and "mailing" would leave
// a stray "ing". The (?i:...) group makes them match in any letter case,
// because Tokenize lower-cases fragments only after splitting. The flag is
// scoped to the stop words so the meaning of [^\w]+ is unchanged. "you're" is
// listed before "you" so the contraction is consumed as a single stop word
// (the apostrophe would otherwise satisfy \b right after "you").
var splitTokenRx = regexp.MustCompile(`[^\w]+|\b(?i:able|all|already|and|any|are|because|both|can|come|each|email|even|few|first|for|from|give|has|have|http|information|into|it's|just|know|like|long|look|made|mail|mailing|mailto|make|many|more|most|much|need|not|now|number|off|one|only|out|own|people|place|right|same|see|such|that|the|this|through|time|using|web|where|why|with|without|work|world|year|years|you're|your|you)\b`)