-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranslate_parse.go
More file actions
118 lines (93 loc) · 2.28 KB
/
translate_parse.go
File metadata and controls
118 lines (93 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package main
import (
"bytes"
"gopkg.in/jdkato/prose.v2"
"log"
"math"
"regexp"
"strings"
)
// SPACE_SUB is the placeholder token used to carry spaces through
// tokenization: TranslateTextWithParse rewrites every " " in the input as
// " "+SPACE_SUB+" ", and the spaceBack helper inside splitToSequences maps a
// token equal to SPACE_SUB back to a single space.
const SPACE_SUB = "|"
// TranslateTextWithParse translates text by tokenizing it with prose, grouping
// the tokens with splitToSequences, and replacing every resulting sequence that
// has a non-nil entry in dictMap with translationWords(entry, maxAlt).
// Sequences without an entry are copied to the output unchanged.
//
// Before tokenizing, every space in text is rewritten as " "+SPACE_SUB+" "
// (presumably so that each space survives tokenization as its own token).
//
// langFrom and langTo are accepted for interface compatibility but are not
// used by this function. maxAlt is forwarded to translationWords as-is.
//
// If the prose document cannot be built, the error is logged and a
// human-readable apology string containing the original text is returned.
func TranslateTextWithParse(langFrom, langTo, text string, maxAlt int) string {
	modifiedText := strings.Replace(text, " ", " "+SPACE_SUB+" ", -1)

	doc, err := prose.NewDocument(modifiedText)
	if err != nil {
		// Log without terminating: log.Fatal would call os.Exit(1), which
		// made the fallback return below unreachable and killed the process
		// on a single bad input.
		log.Print(err)
		return "Sorry, cannot build a new document from text " + text
	}

	var res bytes.Buffer
	for _, seq := range splitToSequences(doc.Tokens()) {
		// Single lookup; a nil entry means "no translation known".
		if words := dictMap[seq]; words != nil {
			res.WriteString(translationWords(words, maxAlt))
			continue
		}
		res.WriteString(seq)
	}
	return res.String()
}
// translatableTagRe matches the tags of tokens that take part in dictionary
// lookup: tags consisting solely of upper-case ASCII letters. It is compiled
// once at package scope rather than once per token.
var translatableTagRe = regexp.MustCompile("^[A-Z]+$")

// splitToSequences groups tokens into the strings that the caller looks up in
// the dictionary.
//
// Tokens whose Tag does not match translatableTagRe are emitted verbatim
// (tok.Text) and terminate any sequence being built. For every other token the
// lower-cased text is appended to the pending sequence and passed to
// findByMinDist; while that yields a non-empty dictionary sequence with a
// strictly smaller distance than the pending one, the pending sequence is
// replaced by it. Otherwise the pending sequence is emitted and the current
// token is tried on its own: if findByMinDist finds nothing for it, the token
// text is emitted as-is, else it becomes the start of a new pending sequence.
//
// A token equal to SPACE_SUB is treated as a single space on the translatable
// path. The returned slice is nil when nothing was emitted.
func splitToSequences(tokens []prose.Token) []string {
	var result []string
	seqSoFar := ""
	distSoFar := math.MaxInt64

	// flush emits the pending sequence, if any, and resets the state so the
	// next token starts a fresh sequence. seqSoFar is only ever set together
	// with distSoFar, so resetting unconditionally is safe.
	flush := func() {
		if seqSoFar != "" {
			result = append(result, seqSoFar)
		}
		seqSoFar = ""
		distSoFar = math.MaxInt64
	}

	// spaceBack turns the space placeholder back into a real space.
	spaceBack := func(str string) string {
		if str == SPACE_SUB {
			return " "
		}
		return str
	}

	for _, tok := range tokens {
		if !translatableTagRe.MatchString(tok.Tag) {
			flush()
			result = append(result, tok.Text)
			continue
		}

		text := spaceBack(tok.Text)
		lowText := strings.ToLower(text)

		// Try to extend the pending sequence with the current token; keep the
		// extension only when it is a strictly better match.
		if dictSeq, dist := findByMinDist(seqSoFar + lowText); dictSeq != "" && dist < distSoFar {
			seqSoFar, distSoFar = dictSeq, dist
			continue
		}

		// Extending does not help: emit what we have and restart from the
		// current token alone. The lookup is done once and both results are
		// reused (it used to be performed twice in a row).
		flush()
		dictSeq, dist := findByMinDist(lowText)
		if dictSeq == "" {
			result = append(result, text)
			continue
		}
		seqSoFar, distSoFar = dictSeq, dist
	}

	// Final flush of whatever sequence is still pending.
	flush()
	return result
}