Kevin Hu committed · Commit 1a2e406 · Parent(s): d3d83ec

Refine synonym query. (#3855)

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement

Files changed:
- conf/mapping.json +9 -1
- rag/nlp/query.py +16 -9
conf/mapping.json
CHANGED
```diff
@@ -140,13 +140,21 @@
         }
       },
       {
-        "
+        "rank_feature": {
           "match": "*_fea",
           "mapping": {
             "type": "rank_feature"
           }
         }
       },
+      {
+        "rank_features": {
+          "match": "*_feas",
+          "mapping": {
+            "type": "rank_features"
+          }
+        }
+      },
       {
         "dense_vector": {
           "match": "*_512_vec",
```
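The existing `rank_feature` template (fields ending in `_fea`, one numeric signal per document) is kept, and a new `rank_features` template is added for fields ending in `_feas`, which hold a map of named features. A minimal sketch of how the two Elasticsearch field types behave, using the official Python client; the index name, field names, and values are illustrative assumptions, not taken from this repo:

```python
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

# "pagerank_fea" matches "*_fea"  -> mapped as rank_feature (single number);
# "tag_feas"     matches "*_feas" -> mapped as rank_features (feature -> weight map).
es.index(index="demo_docs", id="1", document={
    "content": "refine synonym query",
    "pagerank_fea": 12.5,
    "tag_feas": {"nlp": 3.0, "search": 1.2},
})

# rank_feature queries fold those stored signals into the relevance score.
res = es.search(index="demo_docs", query={
    "bool": {
        "must": {"match": {"content": "synonym"}},
        "should": [
            {"rank_feature": {"field": "pagerank_fea"}},
            {"rank_feature": {"field": "tag_feas.nlp"}},
        ],
    }
})
```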
rag/nlp/query.py
CHANGED
```diff
@@ -120,7 +120,7 @@ class FulltextQueryer:
             keywords.append(tt)
             twts = self.tw.weights([tt])
             syns = self.syn.lookup(tt)
-            if syns: keywords.extend(syns)
+            if syns and len(keywords) < 32: keywords.extend(syns)
             logging.debug(json.dumps(twts, ensure_ascii=False))
             tms = []
             for tk, w in sorted(twts, key=lambda x: x[1] * -1):
```
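The only change in this hunk is the `len(keywords) < 32` guard: whole-sentence synonyms stop being collected once the keyword list reaches 32 entries. A self-contained sketch of the pattern; the budget constant mirrors the PR, while the tokens and lookup table are made up:

```python
MAX_KEYWORDS = 32  # expansion budget used by this PR

def expand(tokens, lookup):
    keywords = []
    for tt in tokens:
        keywords.append(tt)
        syns = lookup(tt) or []
        # The extend used to be unconditional, so synonym-rich queries could
        # grow without bound; now growth stops once the budget is reached.
        if syns and len(keywords) < MAX_KEYWORDS:
            keywords.extend(syns)
    return keywords

print(expand(["ml", "ai"], {"ml": ["machine learning"]}.get))
# ['ml', 'machine learning', 'ai']
```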
```diff
@@ -140,17 +140,24 @@ class FulltextQueryer:
                 sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
                 sm = [m for m in sm if len(m) > 1]
 
-                keywords.append(re.sub(r"[ \\\"']+", "", tk))
-                keywords.extend(sm)
-                if len(keywords) >= 12:
-                    break
+                if len(keywords) < 32:
+                    keywords.append(re.sub(r"[ \\\"']+", "", tk))
+                    keywords.extend(sm)
 
                 tk_syns = self.syn.lookup(tk)
+                tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
+                if len(keywords) < 32: keywords.extend([s for s in tk_syns if s])
+                tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
+                tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns]
+
+                if len(keywords) >= 32:
+                    break
+
                 tk = FulltextQueryer.subSpecialChar(tk)
                 if tk.find(" ") > 0:
                     tk = '"%s"' % tk
                 if tk_syns:
-                    tk = f"({tk} %s)" % " ".join(tk_syns)
+                    tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)
                 if sm:
                     tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
                 if tk.strip():
```
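Besides enforcing the same 32-keyword budget, the rewritten block changes how a token's synonyms enter the full-text query: they are cleaned with `subSpecialChar`, split by `fine_grained_tokenize`, quoted when multi-word, and attached with an explicit `OR` at a 0.2 boost rather than appended at full weight. A hedged sketch of the resulting clause, with an invented token and synonym list:

```python
tk = "nlp"
tk_syns = ["natural language processing", "nlu"]  # after cleaning/tokenizing
tk_syns = [f'"{s}"' if s.find(" ") > 0 else s for s in tk_syns]

# Old clause:  (nlp "natural language processing" nlu)            (equal weight)
# New clause:  (nlp OR ("natural language processing" nlu)^0.2)   (down-weighted)
tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)
print(tk)
```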
```diff
@@ -159,14 +166,14 @@ class FulltextQueryer:
             tms = " ".join([f"({t})^{w}" for t, w in tms])
 
             if len(twts) > 1:
-                tms += ' ("%s"~
+                tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt)
             if re.match(r"[0-9a-z ]+$", tt):
                 tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)
 
             syns = " OR ".join(
                 [
-                    '"%s"
-                    % FulltextQueryer.subSpecialChar(
+                    '"%s"'
+                    % rag_tokenizer.tokenize(FulltextQueryer.subSpecialChar(s))
                     for s in syns
                 ]
             )
```
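In Lucene/Elasticsearch query-string syntax, `"..."~2` is a phrase query whose terms may sit up to two positions apart, and `^1.5` scales the clause's score, so multi-term questions now reward near-phrase matches; sentence-level synonyms are also run through `rag_tokenizer.tokenize` before quoting so they match the tokenized index. A toy rendering of the proximity clause, with invented query text and weights:

```python
tt = "refine synonym query"
tokenized = tt  # stand-in for rag_tokenizer.tokenize(tt)

tms = "(refine)^0.5 (synonym)^0.3 (query)^0.2"
# Phrase with slop 2, boosted 1.5x, appended when the query has several terms.
tms += ' ("%s"~2)^1.5' % tokenized
print(tms)
# (refine)^0.5 (synonym)^0.3 (query)^0.2 ("refine synonym query"~2)^1.5
```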