// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"): you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

package riot

import (
	// "fmt"
	"strings"

	"github.com/go-ego/gpy"
	"github.com/go-ego/riot/types"
)

// TMap defines the tokens map type map[string][]int
type TMap map[string][]int
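
// segmenterReq is a segmentation request passed to the segmenter workers
// through engine.segmenterChan.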
type segmenterReq struct {
	docId uint64
	hash  uint32
	data  types.DocData
	// data types.DocumentIndexData
	forceUpdate bool
}

// ForSplitData builds the token map and token count from the split segment data (segspl).
func (engine *Engine) ForSplitData(strData []string, num int) (TMap, int) {
	var (
		numTokens int
		splitStr  string
	)

	tokensMap := make(map[string][]int)
	for i := 0; i < num; i++ {
		if strData[i] != "" {
			if !engine.stopTokens.IsStopToken(strData[i]) {
				numTokens++
				tokensMap[strData[i]] = append(tokensMap[strData[i]], numTokens)
			}

			splitStr += strData[i]
			if !engine.stopTokens.IsStopToken(splitStr) {
				numTokens++
				tokensMap[splitStr] = append(tokensMap[splitStr], numTokens)
			}

			if engine.initOptions.Using == 6 {
				// Using mode 6: also index the cumulative combinations
				// of the following elements.
				var splitsStr string
				for s := i + 1; s < len(strData); s++ {
					splitsStr += strData[s]

					if !engine.stopTokens.IsStopToken(splitsStr) {
						numTokens++
						tokensMap[splitsStr] = append(tokensMap[splitsStr], numTokens)
					}
				}
			}
		}
	}

	return tokensMap, numTokens
}
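
// splitData builds the token map for a request from its content: segmenter
// output for Using == 3, space-separated words for Using == 4, a rune-level
// split otherwise, plus any tokens supplied with the request.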
func (engine *Engine) splitData(request segmenterReq) (TMap, int) {
	var (
		num       int
		numTokens int
	)
	tokensMap := make(map[string][]int)

	if request.data.Content != "" {
		content := strings.ToLower(request.data.Content)

		if engine.initOptions.Using == 3 {
			// use segmenter
			segments := engine.segmenter.ModeSegment([]byte(content),
				engine.initOptions.GseMode)

			for _, segment := range segments {
				token := segment.Token().Text()
				if !engine.stopTokens.IsStopToken(token) {
					tokensMap[token] = append(tokensMap[token], segment.Start())
				}
			}

			numTokens += len(segments)
		}

		if engine.initOptions.Using == 4 {
			tokensMap, numTokens = engine.defaultTokens(content)
		}

		if engine.initOptions.Using != 4 {
			strData := strings.Split(content, "")
			num = len(strData)

			tokenMap, numToken := engine.ForSplitData(strData, num)
			numTokens += numToken

			for key, val := range tokenMap {
				tokensMap[key] = val
			}
		}
	}

	for _, t := range request.data.Tokens {
		if !engine.stopTokens.IsStopToken(t.Text) {
			tokensMap[t.Text] = t.Locations
		}
	}

	numTokens += len(request.data.Tokens)

	return tokensMap, numTokens
}
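
// segmenterData builds the token map for a request according to
// engine.initOptions.Using: modes 0 and 1 segment the content with the
// segmenter, mode 2 (or modes 1/3 with empty content) uses only the
// supplied tokens, and everything else falls back to splitData.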
func (engine *Engine) segmenterData(request segmenterReq) (TMap, int) {
	tokensMap := make(map[string][]int)
	numTokens := 0

	if engine.initOptions.Using == 0 && request.data.Content != "" {
		// Segment the content; when the document body is not empty,
		// keywords are taken from the content segmentation first.
		segments := engine.segmenter.ModeSegment([]byte(request.data.Content),
			engine.initOptions.GseMode)

		for _, segment := range segments {
			token := segment.Token().Text()
			if !engine.stopTokens.IsStopToken(token) {
				tokensMap[token] = append(tokensMap[token], segment.Start())
			}
		}

		for _, t := range request.data.Tokens {
			if !engine.stopTokens.IsStopToken(t.Text) {
				tokensMap[t.Text] = t.Locations
			}
		}

		numTokens = len(segments) + len(request.data.Tokens)

		return tokensMap, numTokens
	}

	if engine.initOptions.Using == 1 && request.data.Content != "" {
		// Segment the content; when the document body is not empty,
		// keywords are taken from the content segmentation first.
		segments := engine.segmenter.ModeSegment([]byte(request.data.Content),
			engine.initOptions.GseMode)

		for _, segment := range segments {
			token := segment.Token().Text()
			if !engine.stopTokens.IsStopToken(token) {
				tokensMap[token] = append(tokensMap[token], segment.Start())
			}
		}

		numTokens = len(segments)

		return tokensMap, numTokens
	}

	if engine.initOptions.Using == 2 ||
		((engine.initOptions.Using == 1 || engine.initOptions.Using == 3) &&
			request.data.Content == "") {
		for _, t := range request.data.Tokens {
			if !engine.stopTokens.IsStopToken(t.Text) {
				tokensMap[t.Text] = t.Locations
			}
		}

		numTokens = len(request.data.Tokens)

		return tokensMap, numTokens
	}

	tokenMap, lenSplitData := engine.splitData(request)

	return tokenMap, lenSplitData
}
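
// defaultTokens splits the content on spaces and builds the token map
// with ForSplitData.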
func (engine *Engine) defaultTokens(content string) (tokensMap TMap, numTokens int) {
	// use segmenter
	tokensMap = make(map[string][]int)
	strData := strings.Split(content, " ")
	num := len(strData)

	// if num == 1 {
	// 	tokensMap[request.data.Content] = []int{1}
	// }

	if num > 0 {
		tokenMap, numToken := engine.ForSplitData(strData, num)
		numTokens += numToken

		for key, val := range tokenMap {
			tokensMap[key] = val
		}
	}

	return
}
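
// segmenterWorker reads requests from engine.segmenterChan, tokenizes each
// document, and forwards the resulting indexer and ranker requests to the
// shard selected by the request hash.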
func (engine *Engine) segmenterWorker() {
	for {
		request := <-engine.segmenterChan
		if request.docId == 0 {
			if request.forceUpdate {
				for i := 0; i < engine.initOptions.NumShards; i++ {
					engine.indexerAddDocChans[i] <- indexerAddDocReq{
						forceUpdate: true}
				}
			}
			continue
		}

		shard := engine.getShard(request.hash)
		tokensMap := make(map[string][]int)
		numTokens := 0

		if !(engine.initOptions.NotUseGse && engine.initOptions.Using == 0) {
			tokensMap, numTokens = engine.segmenterData(request)
		} else {
			if request.data.Content != "" {
				content := strings.ToLower(request.data.Content)
				tokensMap, numTokens = engine.defaultTokens(content)
			}

			for _, t := range request.data.Tokens {
				if !engine.stopTokens.IsStopToken(t.Text) {
					tokensMap[t.Text] = t.Locations
				}
			}

			numTokens += len(request.data.Tokens)
		}

		// Add document labels that are not produced by segmentation.
		for _, label := range request.data.Labels {
			if !engine.initOptions.NotUseGse {
				if !engine.stopTokens.IsStopToken(label) {
					// If the keyword already exists in the content, adding it
					// without this check would discard its location information.
					if _, ok := tokensMap[label]; !ok {
						tokensMap[label] = []int{}
					}
				}
			} else {
				// If the keyword already exists in the content, adding it
				// without this check would discard its location information.
				if _, ok := tokensMap[label]; !ok {
					tokensMap[label] = []int{}
				}
			}
		}

		indexerRequest := indexerAddDocReq{
			doc: &types.DocIndex{
				DocId:    request.docId,
				TokenLen: float32(numTokens),
				Keywords: make([]types.KeywordIndex, len(tokensMap)),
			},
			forceUpdate: request.forceUpdate,
		}

		iTokens := 0
		for k, v := range tokensMap {
			indexerRequest.doc.Keywords[iTokens] = types.KeywordIndex{
				Text: k,
				// The frequency of labels that are not from segmentation is 0,
				// so they do not take part in the tf-idf computation.
				Frequency: float32(len(v)),
				Starts:    v}
			iTokens++
		}

		engine.indexerAddDocChans[shard] <- indexerRequest

		if request.forceUpdate {
			for i := 0; i < engine.initOptions.NumShards; i++ {
				if i == shard {
					continue
				}
				engine.indexerAddDocChans[i] <- indexerAddDocReq{forceUpdate: true}
			}
		}

		rankerRequest := rankerAddDocReq{
			// docId: request.docId, fields: request.data.Fields}
			docId: request.docId, fields: request.data.Fields,
			content: request.data.Content, attri: request.data.Attri}

		engine.rankerAddDocChans[shard] <- rankerRequest
	}
}

// PinYin returns search tokens for the Chinese text hans: its characters and
// prefixes, its segmented words, and the pinyin spellings and first-letter
// abbreviations of the text.
func (engine *Engine) PinYin(hans string) []string {
	var (
		str      string
		pyStr    string
		strArr   []string
		splitStr string
		// splitArr []string
	)

	//
	splitHans := strings.Split(hans, "")
	for i := 0; i < len(splitHans); i++ {
		if splitHans[i] != "" {
			if !engine.stopTokens.IsStopToken(splitHans[i]) {
				strArr = append(strArr, splitHans[i])
			}

			splitStr += splitHans[i]
		}

		if !engine.stopTokens.IsStopToken(splitStr) {
			strArr = append(strArr, splitStr)
		}
	}

	// Segment the text into words
	if !engine.initOptions.NotUseGse {
		sehans := engine.Segment(hans)
		for h := 0; h < len(sehans); h++ {
			if !engine.stopTokens.IsStopToken(sehans[h]) {
				strArr = append(strArr, sehans[h])
			}
		}
	}

	//
	// py := pinyin.LazyConvert(sehans[h], nil)
	py := gpy.LazyConvert(hans, nil)

	// log.Println("py...", py)
	for i := 0; i < len(py); i++ {
		// log.Println("py[i]...", py[i])
		pyStr += py[i]
		if !engine.stopTokens.IsStopToken(pyStr) {
			strArr = append(strArr, pyStr)
		}

		if len(py[i]) > 0 {
			str += py[i][0:1]
			if !engine.stopTokens.IsStopToken(str) {
				strArr = append(strArr, str)
			}
		}
	}

	return strArr
}