keyword-extraction/keywordprocessor.go
loser bff9922066 更新 'keywordprocessor.go'
Signed-off-by: loser <1711788888@qq.com>
2023-11-10 09:18:17 +08:00

182 lines
4.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package extractor
import (
"unicode"
"unicode/utf8"
)
type WalkFn func(start, end int) bool
type KeywordProcessor struct {
root *Node
whiteSpaceChars []string // 不知道有啥用反正python也定义了就跟着定义一个吧
nonWordBoundaries string
caseSensitive bool // 匹配是否区分大小写
}
func NewKeywordProcessor(caseSensitive bool) *KeywordProcessor {
return &KeywordProcessor{
root: newNode(),
caseSensitive: caseSensitive,
whiteSpaceChars: []string{".", "\t", "\n", "\a", " ", ","}, // python 就是这样定义的,而且它也没有用到,所以先留着这里吧
}
}
func (kp *KeywordProcessor) setItem(keyword string) {
if len(keyword) == 0 {
return
}
node := kp.root
for _, char := range keyword {
if !kp.caseSensitive {
char = unicode.ToLower(char)
}
if _, ok := node.children[char]; !ok {
// 子节点已存在 将子节点设为当前节点
node.children[char] = newNode()
}
node = node.children[char]
}
// 记录当前匹配词的长度
node.exist[len(keyword)] = struct{}{}
}
func (kp *KeywordProcessor) Build() {
// 创建一个空队列,用于层级遍历 Trie 树节点
queue := make([]*Node, 0)
// 将根节点作为初始节点
queue = append(queue, kp.root)
for len(queue) > 0 {
// 获取队列中的第一个节点
currentNode := queue[0]
// 弹出队列中的第一个节点
queue = queue[1:]
// 遍历当前节点的子节点
for char, childNode := range currentNode.children {
// 将子节点添加到队列中
queue = append(queue, childNode)
// 当前节点父节点的失败指针
faFail := currentNode.failure
// 递归获取父失败指针 直到获取到根节点为止
for faFail != nil && faFail.children[char] == nil {
faFail = faFail.failure
}
childNode.failure = kp.root
if faFail != nil {
childNode.failure = faFail.children[char]
}
for key := range childNode.failure.exist {
childNode.exist[key] = struct{}{}
}
}
}
}
func (kp *KeywordProcessor) AddKeyWord(keyword string) *KeywordProcessor {
kp.setItem(keyword)
return kp
}
func (kp *KeywordProcessor) AddKeywordsFromList(keywords []string) *KeywordProcessor {
for _, keyword := range keywords {
kp.setItem(keyword)
}
return kp
}
func (kp *KeywordProcessor) walk(sentence string, wf WalkFn) {
// 从根节点开始查找
currentNode := kp.root
// 遍历文本 sentence 的每个字符,并记录当前字符的索引为 idx当前字符为 r
idx := 0
for len(sentence) > 0 {
r, l := utf8.DecodeRuneInString(sentence)
idx += l
sentence = sentence[l:]
if !kp.caseSensitive {
r = unicode.ToLower(r)
}
// 在循环中 判断是否有当前字符子节点
//如果不存在,则说明匹配失败,需要通过失败路径回溯到前一个节点,直到找到一个匹配的子节点或回溯到根节点。
for currentNode.children[r] == nil && currentNode.failure != nil {
currentNode = currentNode.failure
}
if currentNode.children[r] == nil {
continue
}
currentNode = currentNode.children[r]
for length := range currentNode.exist {
if !wf(idx-length, idx) {
return
}
}
}
}
func (kp *KeywordProcessor) walkByte(sentence []byte, wf WalkFn) {
// 从根节点开始查找
currentNode := kp.root
// 遍历文本 sentence 的每个字符,并记录当前字符的索引为 idx当前字符为 r
idx := 0
for len(sentence) > 0 {
r, l := utf8.DecodeRune(sentence)
idx += l
sentence = sentence[l:]
if !kp.caseSensitive {
r = unicode.ToLower(r)
}
// 在循环中 判断是否有当前字符子节点
//如果不存在,则说明匹配失败,需要通过失败路径回溯到前一个节点,直到找到一个匹配的子节点或回溯到根节点。
for currentNode.children[r] == nil && currentNode.failure != nil {
currentNode = currentNode.failure
}
if currentNode.children[r] == nil {
continue
}
currentNode = currentNode.children[r]
for length := range currentNode.exist {
if !wf(idx-length, idx) {
return
}
}
}
}
// ExtractKeywords 匹配关键词
func (kp *KeywordProcessor) ExtractKeywords(sentence string) []Match {
var matches []Match
if len(sentence) == 0 {
return matches
}
kp.walk(sentence, func(start, end int) bool {
matches = append(matches, Match{
start: start,
end: end,
match: sentence[start:end],
})
return true
})
return matches
}
func (kp *KeywordProcessor) ExtractKeywordsFromBytes(sentence []byte) []Match {
var matches []Match
if len(sentence) == 0 {
return matches
}
kp.walkByte(sentence, func(start, end int) bool {
matches = append(matches, Match{
start: start,
end: end,
match: string(sentence[start:end]),
})
return true
})
return matches
}