Kevin Hu
		
	committed on
		
		
					Commit 
							
							·
						
						1a2e406
	
1
								Parent(s):
							
							d3d83ec
								
Refine synonym query. (#3855)
Browse files

### What problem does this PR solve?
### Type of change
- [x] Performance Improvement
- conf/mapping.json +9 -1
- rag/nlp/query.py +16 -9
    	
        conf/mapping.json
    CHANGED
    
    | @@ -140,13 +140,21 @@ | |
| 140 | 
             
                    }
         | 
| 141 | 
             
                  },
         | 
| 142 | 
             
                  {
         | 
| 143 | 
            -
                    " | 
| 144 | 
             
                      "match": "*_fea",
         | 
| 145 | 
             
                      "mapping": {
         | 
| 146 | 
             
                        "type": "rank_feature"
         | 
| 147 | 
             
                      }
         | 
| 148 | 
             
                    }
         | 
| 149 | 
             
                  },
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 150 | 
             
                  {
         | 
| 151 | 
             
                    "dense_vector": {
         | 
| 152 | 
             
                      "match": "*_512_vec",
         | 
|  | |
| 140 | 
             
                    }
         | 
| 141 | 
             
                  },
         | 
| 142 | 
             
                  {
         | 
| 143 | 
            +
                    "rank_feature": {
         | 
| 144 | 
             
                      "match": "*_fea",
         | 
| 145 | 
             
                      "mapping": {
         | 
| 146 | 
             
                        "type": "rank_feature"
         | 
| 147 | 
             
                      }
         | 
| 148 | 
             
                    }
         | 
| 149 | 
             
                  },
         | 
| 150 | 
            +
                  {
         | 
| 151 | 
            +
                    "rank_features": {
         | 
| 152 | 
            +
                      "match": "*_feas",
         | 
| 153 | 
            +
                      "mapping": {
         | 
| 154 | 
            +
                        "type": "rank_features"
         | 
| 155 | 
            +
                      }
         | 
| 156 | 
            +
                    }
         | 
| 157 | 
            +
                  },
         | 
| 158 | 
             
                  {
         | 
| 159 | 
             
                    "dense_vector": {
         | 
| 160 | 
             
                      "match": "*_512_vec",
         | 
    	
        rag/nlp/query.py
    CHANGED
    
    | @@ -120,7 +120,7 @@ class FulltextQueryer: | |
| 120 | 
             
                        keywords.append(tt)
         | 
| 121 | 
             
                        twts = self.tw.weights([tt])
         | 
| 122 | 
             
                        syns = self.syn.lookup(tt)
         | 
| 123 | 
            -
                        if syns: keywords.extend(syns)
         | 
| 124 | 
             
                        logging.debug(json.dumps(twts, ensure_ascii=False))
         | 
| 125 | 
             
                        tms = []
         | 
| 126 | 
             
                        for tk, w in sorted(twts, key=lambda x: x[1] * -1):
         | 
| @@ -140,17 +140,24 @@ class FulltextQueryer: | |
| 140 | 
             
                            sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
         | 
| 141 | 
             
                            sm = [m for m in sm if len(m) > 1]
         | 
| 142 |  | 
| 143 | 
            -
                            keywords | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
| 146 | 
            -
                                break
         | 
| 147 |  | 
| 148 | 
             
                            tk_syns = self.syn.lookup(tk)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 149 | 
             
                            tk = FulltextQueryer.subSpecialChar(tk)
         | 
| 150 | 
             
                            if tk.find(" ") > 0:
         | 
| 151 | 
             
                                tk = '"%s"' % tk
         | 
| 152 | 
             
                            if tk_syns:
         | 
| 153 | 
            -
                                tk = f"({tk} %s)" % " ".join(tk_syns)
         | 
| 154 | 
             
                            if sm:
         | 
| 155 | 
             
                                tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
         | 
| 156 | 
             
                            if tk.strip():
         | 
| @@ -159,14 +166,14 @@ class FulltextQueryer: | |
| 159 | 
             
                        tms = " ".join([f"({t})^{w}" for t, w in tms])
         | 
| 160 |  | 
| 161 | 
             
                        if len(twts) > 1:
         | 
| 162 | 
            -
                            tms += ' ("%s"~ | 
| 163 | 
             
                        if re.match(r"[0-9a-z ]+$", tt):
         | 
| 164 | 
             
                            tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)
         | 
| 165 |  | 
| 166 | 
             
                        syns = " OR ".join(
         | 
| 167 | 
             
                            [
         | 
| 168 | 
            -
                                '"%s" | 
| 169 | 
            -
                                % FulltextQueryer.subSpecialChar( | 
| 170 | 
             
                                for s in syns
         | 
| 171 | 
             
                            ]
         | 
| 172 | 
             
                        )
         | 
|  | |
| 120 | 
             
                        keywords.append(tt)
         | 
| 121 | 
             
                        twts = self.tw.weights([tt])
         | 
| 122 | 
             
                        syns = self.syn.lookup(tt)
         | 
| 123 | 
            +
                        if syns and len(keywords) < 32: keywords.extend(syns)
         | 
| 124 | 
             
                        logging.debug(json.dumps(twts, ensure_ascii=False))
         | 
| 125 | 
             
                        tms = []
         | 
| 126 | 
             
                        for tk, w in sorted(twts, key=lambda x: x[1] * -1):
         | 
|  | |
| 140 | 
             
                            sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
         | 
| 141 | 
             
                            sm = [m for m in sm if len(m) > 1]
         | 
| 142 |  | 
| 143 | 
            +
                            if len(keywords) < 32:
         | 
| 144 | 
            +
                                keywords.append(re.sub(r"[ \\\"']+", "", tk))
         | 
| 145 | 
            +
                                keywords.extend(sm)
         | 
|  | |
| 146 |  | 
| 147 | 
             
                            tk_syns = self.syn.lookup(tk)
         | 
| 148 | 
            +
                            tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
         | 
| 149 | 
            +
                            if len(keywords) < 32: keywords.extend([s for s in tk_syns if s])
         | 
| 150 | 
            +
                            tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
         | 
| 151 | 
            +
                            tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns]
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                            if len(keywords) >= 32:
         | 
| 154 | 
            +
                                break
         | 
| 155 | 
            +
             | 
| 156 | 
             
                            tk = FulltextQueryer.subSpecialChar(tk)
         | 
| 157 | 
             
                            if tk.find(" ") > 0:
         | 
| 158 | 
             
                                tk = '"%s"' % tk
         | 
| 159 | 
             
                            if tk_syns:
         | 
| 160 | 
            +
                                tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)
         | 
| 161 | 
             
                            if sm:
         | 
| 162 | 
             
                                tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
         | 
| 163 | 
             
                            if tk.strip():
         | 
|  | |
| 166 | 
             
                        tms = " ".join([f"({t})^{w}" for t, w in tms])
         | 
| 167 |  | 
| 168 | 
             
                        if len(twts) > 1:
         | 
| 169 | 
            +
                            tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt)
         | 
| 170 | 
             
                        if re.match(r"[0-9a-z ]+$", tt):
         | 
| 171 | 
             
                            tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)
         | 
| 172 |  | 
| 173 | 
             
                        syns = " OR ".join(
         | 
| 174 | 
             
                            [
         | 
| 175 | 
            +
                                '"%s"'
         | 
| 176 | 
            +
                                % rag_tokenizer.tokenize(FulltextQueryer.subSpecialChar(s))
         | 
| 177 | 
             
                                for s in syns
         | 
| 178 | 
             
                            ]
         | 
| 179 | 
             
                        )
         |