-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathutils.py
127 lines (91 loc) · 2.46 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import jieba
import pyltp
import jiagu
import pyhanlp
import thulac
import pynlpir
import snownlp
import fool
class Segment:
    """Thin wrapper exposing several Chinese word segmenters behind one object.

    Every public method takes a text string and returns the tokens produced by
    the toolkit the method is named after.  The method names match the entries
    of the ``tools`` list defined at the end of this file.
    """

    def __init__(self):
        """Initialise the toolkits that need explicit start-up or model loading."""
        # Set the clean-up markers first so that __del__ stays safe even when
        # one of the initialisation steps below raises.
        self.ltpseg = None
        self._nlpir_open = False

        jieba.initialize()

        ltpseg = pyltp.Segmentor()
        ltpseg.load('model/ltp_data_v3.4.0/cws.model')
        self.ltpseg = ltpseg

        self.thu1 = thulac.thulac(seg_only=True)
        jiagu.init()

        pynlpir.open()
        self._nlpir_open = True

    def __del__(self):
        """Release the LTP model and close NLPIR, each at most once."""
        ltpseg = getattr(self, 'ltpseg', None)
        nlpir_open = getattr(self, '_nlpir_open', False)
        # Clear the markers before releasing so a repeated call is a no-op.
        self.ltpseg = None
        self._nlpir_open = False
        try:
            if ltpseg is not None:
                ltpseg.release()
        finally:
            # Only close NLPIR when this instance actually opened it.
            if nlpir_open:
                pynlpir.close()

    def jiagu(self, text):
        """Segment *text* with Jiagu."""
        return jiagu.seg(text)

    def jieba(self, text):
        """Segment *text* with Jieba and return the tokens as a list."""
        return list(jieba.cut(text))

    def pyltp(self, text):
        """Segment *text* with HIT LTP and return the tokens as a list."""
        # Copy the binding's native vector into a plain list so the result has
        # the same type as the other wrappers' results.
        return list(self.ltpseg.segment(text))

    def hanlp(self, text):
        """Segment *text* with HanLP and return the ``word`` of each term."""
        return [term.word for term in pyhanlp.HanLP.segment(text)]

    def thulac(self, text):
        """Segment *text* with Tsinghua THULAC.

        THULAC is asked for a single whitespace-joined string (``text=True``),
        which is then split back into tokens.
        """
        return self.thu1.cut(text, text=True).split()

    def pynlpir(self, text):
        """Segment *text* with NLPIR, with part-of-speech tagging disabled."""
        return pynlpir.segment(text, pos_tagging=False)

    def snownlp(self, text):
        """Segment *text* with SnowNLP."""
        return snownlp.SnowNLP(text).words

    def foolnltk(self, text):
        """Segment *text* with FoolNLTK.

        NOTE(review): the result of ``fool.cut`` is returned unchanged; some
        FoolNLTK versions appear to return one token list per input sentence
        rather than a flat list -- confirm against the installed version.
        """
        return fool.cut(text)
class Report:
    """Compares a candidate word segmentation against a reference one."""

    def __init__(self):
        pass

    @staticmethod
    def _word_spans(words):
        """Return the set of ``(start, end)`` character spans covered by *words*.

        Offsets count characters of the concatenated words, i.e. the text with
        the separating whitespace removed.
        """
        spans = set()
        start = 0
        for word in words:
            end = start + len(word)
            spans.add((start, end))
            start = end
        return spans

    def compare_line(self, reference, candidate):
        """Count words and matching words between two segmented lines.

        A candidate word matches when it covers exactly the same character
        span as a reference word.  The two lines are not checked to contain
        the same characters; spans are compared purely by offset.

        Args:
            reference: Gold-standard line, words separated by whitespace.
            candidate: Segmenter output, words separated by whitespace.

        Returns:
            A tuple ``(ref_words_len, can_words_len, acc_word_len)``: the
            number of reference words, the number of candidate words, and the
            number of reference words whose span also occurs in the candidate.
        """
        ref_words = reference.split()
        can_words = candidate.split()

        # Spans within one segmentation are unique, so the size of the set
        # intersection equals the number of reference spans found in the
        # candidate, without the quadratic list-membership scan.
        matched = self._word_spans(ref_words) & self._word_spans(can_words)

        return len(ref_words), len(can_words), len(matched)
# Script entry point: a small manual check of Report.compare_line.
if __name__=='__main__':
# Disabled demo: build a Segment, then print seg.foolnltk(...) for a short
# sample sentence.
report = Report()
reference = '你 只有 槽'
candidate = '你 只 有 槽'
# The (ref_words_len, can_words_len, acc_word_len) tuple returned here is
# discarded, so running the script prints nothing.
report.compare_line(reference, candidate)
# Each entry is the name of a Segment method, one per wrapped toolkit.
# NOTE(review): indentation was lost in this copy, so it is unclear whether
# this list belongs to the __main__ block or to module level -- confirm.
tools = ['jieba', 'hanlp', 'snownlp', 'foolnltk', 'jiagu', 'pyltp', 'thulac', 'pynlpir']