-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsentence.go
95 lines (80 loc) · 2.45 KB
/
sentence.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package textrank
import (
"math"
"strings"
)
// minWordSentence is the minimum number of words both sentences must have for
// sentenceSimilarity to return a nonzero score; sentences shorter than this
// therefore never gain edges in the graph.
const minWordSentence = 5
// RankSentences ranks the sentences in the given text based on the TextRank
// algorithm and returns a list of the ranked sentences in descending order
// of score.
func RankSentences(text string, iterations int) []string {
	graph := &textgraph{}
	// Build the graph, adding each distinct sentence exactly once.
	added := make(map[string]bool)
	for _, sentence := range tokenizeSentences(text) {
		if added[sentence] {
			continue
		}
		added[sentence] = true
		graph.addNode(sentence, nodeInitialScore)
	}
	linkSentences(graph)
	// Run the iterative scoring pass over every node.
	for _, n := range *graph {
		n.Score = scoreNode(n, iterations)
	}
	return graph.normalize()
}
// linkSentences connects sentence nodes within a graph. Two distinct nodes
// are linked bidirectionally when their similarity exceeds 1. The graph is
// modified in place and also returned for convenience.
func linkSentences(tg *textgraph) *textgraph {
	// Track handled pairs so each unordered pair is considered only once.
	visited := make(map[[2]string]struct{})
	for _, first := range *tg {
		for _, second := range *tg {
			// No self-loops.
			if first.Text == second.Text {
				continue
			}
			if _, done := visited[[2]string{first.Text, second.Text}]; done {
				continue
			}
			// Mark both orderings of the pair as handled.
			visited[[2]string{first.Text, second.Text}] = struct{}{}
			visited[[2]string{second.Text, first.Text}] = struct{}{}
			// Connect nodes based on similarity.
			if sentenceSimilarity(first.Text, second.Text) > 1 {
				first.Edges = append(first.Edges, second)
				second.Edges = append(second.Edges, first)
			}
		}
	}
	return tg
}
// sentenceSimilarity calculates the similarity between two sentences: the
// number of distinct shared non-stopword words, normalized by the log of the
// product of the sentences' word counts (dampening the bias toward long
// sentences). Sentences with fewer than minWordSentence words score 0.
func sentenceSimilarity(a, b string) float64 {
	tokensA := tokenizeWords(a)
	tokensB := tokenizeWords(b)
	if len(tokensA) < minWordSentence || len(tokensB) < minWordSentence {
		return 0
	}
	// Lowercase b's tokens once up front instead of re-lowercasing all of
	// them for every token of a (O(n+m) instead of O(n*m)).
	wordsB := make(map[string]bool, len(tokensB))
	for _, tokenB := range tokensB {
		wordsB[strings.ToLower(tokenB)] = true
	}
	similarWords := make(map[string]bool)
	for _, tokenA := range tokensA {
		wordA := strings.ToLower(tokenA)
		// Ignore stopwords. Only need to check wordA because if wordA is not a
		// stopword and wordB is a stopword, then they are not going to match.
		if _, ok := stopwords[wordA]; ok {
			continue
		}
		if wordsB[wordA] {
			similarWords[wordA] = true
		}
	}
	numSimilarWords := float64(len(similarWords))
	numWordsMult := float64(len(tokensA) * len(tokensB))
	// Defensive guard against dividing by log(1) == 0. With both lengths at
	// least minWordSentence the product is >= 25, so this cannot currently
	// trigger; kept in case the length guard above ever changes.
	if numWordsMult == 1 {
		return 0
	}
	return numSimilarWords / math.Log(numWordsMult)
}