KevinHuSh commited on
Commit
9593f88
·
1 Parent(s): 3dfa5d8

refine code (#626)

Browse files

### What problem does this PR solve?


### Type of change

- [x] Refactoring

Files changed (1)
  1. rag/nlp/huqie.py +0 -423
rag/nlp/huqie.py DELETED
@@ -1,423 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- import copy
4
- import datrie
5
- import math
6
- import os
7
- import re
8
- import string
9
- import sys
10
- from hanziconv import HanziConv
11
- from huggingface_hub import snapshot_download
12
- from nltk import word_tokenize
13
- from nltk.stem import PorterStemmer, WordNetLemmatizer
14
- from api.utils.file_utils import get_project_base_directory
15
-
16
-
17
class Huqie:
    """Hybrid Chinese/English tokenizer backed by a `datrie` frequency trie.

    Chinese text is segmented by forward and backward maximum matching; where
    the two disagree, a DFS over the trie enumerates candidate segmentations
    and the best-scoring one wins.  Mostly non-Chinese text falls back to NLTK
    word tokenization with lemmatization and stemming.
    """

    def key_(self, line):
        # Trie key: repr of the UTF-8 bytes of the lower-cased text, with the
        # surrounding b'...' markers stripped (datrie keys must be printable).
        return str(line.lower().encode("utf-8"))[2:-1]

    def rkey_(self, line):
        # Reverse-lookup key: "DD" sentinel + reversed text, encoded like
        # key_().  Used by maxBackward_ to extend matches right-to-left.
        return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]

    def loadDict_(self, fnm):
        """Build the trie from a whitespace-separated dictionary file.

        Each line is expected to be "<term> <frequency> <tag>".  The stored
        weight is round(log(frequency / DENOMINATOR)); on duplicate terms the
        higher weight wins.  The trie is persisted next to the source file as
        "<fnm>.trie".  Any failure is reported to stderr and swallowed
        (best-effort load).
        """
        print("[HUQIE]:Build trie", fnm, file=sys.stderr)
        try:
            of = open(fnm, "r")
            while True:
                line = of.readline()
                if not line:
                    break
                line = re.sub(r"[\r\n]+", "", line)
                line = re.split(r"[ \t]", line)
                k = self.key_(line[0])
                F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
                if k not in self.trie_ or self.trie_[k][0] < F:
                    self.trie_[self.key_(line[0])] = (F, line[2])
                self.trie_[self.rkey_(line[0])] = 1
            self.trie_.save(fnm + ".trie")
            of.close()
        except Exception as e:
            print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)

    def __init__(self, debug=False):
        self.DEBUG = debug
        # Scale factor applied to raw dictionary frequencies before the log
        # is taken (see loadDict_ / freq).
        self.DENOMINATOR = 1000000
        self.trie_ = datrie.Trie(string.printable)
        self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

        # Segment splitter: runs of ASCII/full-width punctuation, or whole
        # runs of latin letters / digits, are captured as their own segments.
        self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
        try:
            # Prefer the pre-built trie cache; rebuild from the plain-text
            # dictionary below only when loading fails.
            self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
            return
        except Exception as e:
            print("[HUQIE]:Build default trie", file=sys.stderr)
            self.trie_ = datrie.Trie(string.printable)

        self.loadDict_(self.DIR_ + ".txt")

    def loadUserDict(self, fnm):
        """Replace the current trie with the cached "<fnm>.trie", rebuilding
        from the plain-text file *fnm* when no cache exists."""
        try:
            self.trie_ = datrie.Trie.load(fnm + ".trie")
            return
        except Exception as e:
            self.trie_ = datrie.Trie(string.printable)
        self.loadDict_(fnm)

    def addUserDict(self, fnm):
        """Merge the entries of dictionary file *fnm* into the current trie."""
        self.loadDict_(fnm)

    def _strQ2B(self, ustring):
        """Convert full-width characters in *ustring* to half-width."""
        rstring = ""
        for uchar in ustring:
            inside_code = ord(uchar)
            if inside_code == 0x3000:
                inside_code = 0x0020
            else:
                inside_code -= 0xfee0
            if inside_code < 0x0020 or inside_code > 0x7e:  # not a half-width char after shifting: keep the original
                rstring += uchar
            else:
                rstring += chr(inside_code)
        return rstring

    def _tradi2simp(self, line):
        """Convert traditional Chinese characters to simplified."""
        return HanziConv.toSimplified(line)

    def dfs_(self, chars, s, preTks, tkslist):
        """Depth-first enumeration of segmentations of chars[s:].

        Completed candidate token lists (each item is (token, (weight, tag)))
        are appended to *tkslist*.  Returns the furthest position reached from
        *s* through dictionary matches.
        """
        MAX_L = 10
        res = s
        # if s > MAX_L or s>= len(chars):
        if s >= len(chars):
            tkslist.append(preTks)
            return res

        # pruning
        S = s + 1
        if s + 2 <= len(chars):
            t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2])
            if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(
                    self.key_(t2)):
                S = s + 2
        if len(preTks) > 2 and len(
                preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
            t1 = preTks[-1][0] + "".join(chars[s:s + 1])
            if self.trie_.has_keys_with_prefix(self.key_(t1)):
                S = s + 2

        ################
        for e in range(S, len(chars) + 1):
            t = "".join(chars[s:e])
            k = self.key_(t)

            if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
                break

            if k in self.trie_:
                pretks = copy.deepcopy(preTks)
                # NOTE(review): this inner check repeats the outer `k in
                # self.trie_`, so the else branch below is unreachable here.
                if k in self.trie_:
                    pretks.append((t, self.trie_[k]))
                else:
                    pretks.append((t, (-12, '')))
                res = max(res, self.dfs_(chars, e, pretks, tkslist))

        if res > s:
            return res

        # No dictionary match advanced past s: emit the single character as a
        # token (with penalty weight -12 when unknown) and continue.
        t = "".join(chars[s:s + 1])
        k = self.key_(t)
        if k in self.trie_:
            preTks.append((t, self.trie_[k]))
        else:
            preTks.append((t, (-12, '')))

        return self.dfs_(chars, s + 1, preTks, tkslist)

    def freq(self, tk):
        """Return the dictionary frequency of *tk* (0 when unknown),
        recovered from the stored log-scaled weight."""
        k = self.key_(tk)
        if k not in self.trie_:
            return 0
        return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)

    def tag(self, tk):
        """Return the tag stored for *tk* in the dictionary, or ""."""
        k = self.key_(tk)
        if k not in self.trie_:
            return ""
        return self.trie_[k][1]

    def score_(self, tfts):
        """Score a candidate segmentation.

        Fewer tokens (B / len), more multi-char tokens (L) and higher average
        dictionary weight (F) all raise the score.  Returns (tokens, score).
        """
        B = 30
        F, L, tks = 0, 0, []
        for tk, (freq, tag) in tfts:
            F += freq
            L += 0 if len(tk) < 2 else 1
            tks.append(tk)
        F /= len(tks)
        L /= len(tks)
        if self.DEBUG:
            print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
        return tks, B / len(tks) + L + F

    def sortTks_(self, tkslist):
        """Return [(tokens, score), ...] sorted by descending score."""
        res = []
        for tfts in tkslist:
            tks, s = self.score_(tfts)
            res.append((tks, s))
        return sorted(res, key=lambda x: x[1], reverse=True)

    def merge_(self, tks):
        """Re-join adjacent tokens when their concatenation contains a split
        character and is itself a known dictionary word."""
        # patts is unused; the substitutions were disabled (see commented
        # loop below) in favor of the trie-frequency based merge.
        patts = [
            (r"[ ]+", " "),
            (r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
        ]
        # for p,s in patts: tks = re.sub(p, s, tks)

        # if split chars is part of token
        res = []
        tks = re.sub(r"[ ]+", " ", tks).split(" ")
        s = 0
        while True:
            if s >= len(tks):
                break
            E = s + 1
            # Try merging up to 5 consecutive tokens; keep the longest merge
            # that both contains a split char and exists in the dictionary.
            for e in range(s + 2, min(len(tks) + 2, s + 6)):
                tk = "".join(tks[s:e])
                if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
                    E = e
            res.append("".join(tks[s:E]))
            s = E

        return " ".join(res)

    def maxForward_(self, line):
        """Forward maximum matching: greedily take the longest dictionary
        word starting at each position.  Returns (tokens, score)."""
        res = []
        s = 0
        while s < len(line):
            e = s + 1
            t = line[s:e]
            # Grow the candidate while some dictionary key still starts with it.
            while e < len(line) and self.trie_.has_keys_with_prefix(
                    self.key_(t)):
                e += 1
                t = line[s:e]

            # Shrink back until the candidate is an exact dictionary entry.
            while e - 1 > s and self.key_(t) not in self.trie_:
                e -= 1
                t = line[s:e]

            if self.key_(t) in self.trie_:
                res.append((t, self.trie_[self.key_(t)]))
            else:
                res.append((t, (0, '')))

            s = e

        return self.score_(res)

    def maxBackward_(self, line):
        """Backward maximum matching: like maxForward_ but scanning right to
        left using the reversed "DD"-prefixed keys.  Returns (tokens, score)."""
        res = []
        s = len(line) - 1
        while s >= 0:
            e = s + 1
            t = line[s:e]
            # Grow the candidate leftwards while some reversed key matches.
            while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
                s -= 1
                t = line[s:e]

            # Shrink back until the candidate is an exact dictionary entry.
            while s + 1 < e and self.key_(t) not in self.trie_:
                s += 1
                t = line[s:e]

            if self.key_(t) in self.trie_:
                res.append((t, self.trie_[self.key_(t)]))
            else:
                res.append((t, (0, '')))

            s -= 1

        # Tokens were collected right-to-left; reverse before scoring.
        return self.score_(res[::-1])

    def qie(self, line):
        """Coarse tokenization of *line*; returns a space-joined string.

        Text that is less than 20% Chinese characters is tokenized with NLTK
        and lemmatized + stemmed.  Otherwise the line is split on punctuation
        and each segment is tokenized by forward/backward maximum matching;
        spans where the two disagree are re-segmented via dfs_ and the
        best-scoring candidate is kept.
        """
        line = self._strQ2B(line).lower()
        line = self._tradi2simp(line)
        zh_num = len([1 for c in line if is_chinese(c)])
        if zh_num < len(line) * 0.2:
            return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])

        arr = re.split(self.SPLIT_CHAR, line)
        res = []
        for L in arr:
            # Short segments and pure latin/number segments pass through.
            if len(L) < 2 or re.match(
                    r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
                res.append(L)
                continue
            # print(L)

            # use maxforward for the first time
            tks, s = self.maxForward_(L)
            tks1, s1 = self.maxBackward_(L)
            if self.DEBUG:
                print("[FW]", tks, s)
                print("[BW]", tks1, s1)

            # Mark positions where forward and backward matching disagree.
            diff = [0 for _ in range(max(len(tks1), len(tks)))]
            for i in range(min(len(tks1), len(tks))):
                if tks[i] != tks1[i]:
                    diff[i] = 1

            if s1 > s:
                tks = tks1

            i = 0
            while i < len(tks):
                # Copy agreed-upon runs verbatim ...
                s = i
                while s < len(tks) and diff[s] == 0:
                    s += 1
                if s == len(tks):
                    res.append(" ".join(tks[i:]))
                    break
                if s > i:
                    res.append(" ".join(tks[i:s]))

                # ... and re-segment disagreeing runs (max 5 tokens) via DFS.
                e = s
                while e < len(tks) and e - s < 5 and diff[e] == 1:
                    e += 1

                tkslist = []
                self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
                res.append(" ".join(self.sortTks_(tkslist)[0][0]))

                i = e + 1

        res = " ".join(res)
        if self.DEBUG:
            print("[TKS]", self.merge_(res))
        return self.merge_(res)

    def qieqie(self, tks):
        """Fine-grained pass over a qie() result: re-segment longer tokens
        into smaller dictionary words.  Returns a space-joined string."""
        tks = tks.split(" ")
        zh_num = len([1 for c in tks if c and is_chinese(c[0])])
        if zh_num < len(tks) * 0.2:
            # Mostly non-Chinese: only break tokens on "/" separators.
            res = []
            for tk in tks:
                res.extend(tk.split("/"))
            return " ".join(res)

        res = []
        for tk in tks:
            if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
                res.append(tk)
                continue
            tkslist = []
            if len(tk) > 10:
                # Too long for DFS enumeration; keep as-is.
                tkslist.append(tk)
            else:
                self.dfs_(tk, 0, [], tkslist)
            if len(tkslist) < 2:
                res.append(tk)
                continue
            # Take the second-best segmentation (the best is typically the
            # original token itself).
            stk = self.sortTks_(tkslist)[1][0]
            if len(stk) == len(tk):
                stk = tk
            else:
                if re.match(r"[a-z\.-]+$", tk):
                    # For latin tokens, reject splits that produce fragments
                    # shorter than 3 characters.
                    for t in stk:
                        if len(t) < 3:
                            stk = tk
                            break
                    else:
                        stk = " ".join(stk)
                else:
                    stk = " ".join(stk)

            res.append(stk)

        return " ".join(res)
340
-
341
-
342
def is_chinese(s):
    """Return True when character *s* lies in the CJK Unified Ideographs
    range U+4E00..U+9FA5, else False."""
    return u'\u4e00' <= s <= u'\u9fa5'
347
-
348
-
349
def is_number(s):
    """Return True when character *s* is an ASCII digit '0'..'9', else False."""
    return u'\u0030' <= s <= u'\u0039'
354
-
355
-
356
def is_alphabet(s):
    """Return True when character *s* is an ASCII letter (A-Z or a-z),
    else False."""
    is_upper = u'\u0041' <= s <= u'\u005a'
    is_lower = u'\u0061' <= s <= u'\u007a'
    return is_upper or is_lower
362
-
363
-
364
def naiveQie(txt):
    """Split *txt* on single spaces into a token list, re-inserting an
    explicit " " token between two consecutive pieces that both end with an
    ASCII letter (so the boundary survives a later join)."""
    pieces = []
    ends_with_letter = lambda w: re.match(r".*[a-zA-Z]$", w)
    for piece in txt.split(" "):
        if pieces and ends_with_letter(pieces[-1]) and ends_with_letter(piece):
            pieces.append(" ")
        pieces.append(piece)
    return pieces
372
-
373
-
374
# Module-level singleton and functional aliases so callers can simply use
# `from rag.nlp.huqie import qie, qieqie, ...` without instantiating Huqie.
# NOTE: constructing Huqie here loads/builds the trie at import time.
hq = Huqie()
qie = hq.qie
qieqie = hq.qieqie
tag = hq.tag
freq = hq.freq
loadUserDict = hq.loadUserDict
addUserDict = hq.addUserDict
tradi2simp = hq._tradi2simp
strQ2B = hq._strQ2B
383
-
384
if __name__ == '__main__':
    # Ad-hoc smoke test: tokenize representative inputs (pure Chinese, mixed
    # Chinese/English, English/technical strings) with debug output enabled,
    # printing the fine-grained segmentation of each.
    huqie = Huqie(debug=True)
    # huqie.addUserDict("/tmp/tmp.new.tks.dict")
    tks = huqie.qie(
        "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
    print(huqie.qieqie(tks))
    tks = huqie.qie(
        "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
    print(huqie.qieqie(tks))
    tks = huqie.qie(
        "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
    print(huqie.qieqie(tks))
    tks = huqie.qie(
        "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
    print(huqie.qieqie(tks))
    tks = huqie.qie("虽然我不怎么玩")
    print(huqie.qieqie(tks))
    tks = huqie.qie("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
    print(huqie.qieqie(tks))
    tks = huqie.qie(
        "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
    print(huqie.qieqie(tks))
    tks = huqie.qie("这周日你去吗?这周日你有空吗?")
    print(huqie.qieqie(tks))
    tks = huqie.qie("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
    print(huqie.qieqie(tks))
    tks = huqie.qie(
        "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
    print(huqie.qieqie(tks))
    # Optional batch mode: argv[1] = user dictionary path, argv[2] = text
    # file to tokenize line by line.
    if len(sys.argv) < 2:
        sys.exit()
    huqie.DEBUG = False
    huqie.loadUserDict(sys.argv[1])
    of = open(sys.argv[2], "r")
    while True:
        line = of.readline()
        if not line:
            break
        print(huqie.qie(line))
    of.close()