Fix some bugs in text2sql.(#4279)(#4281) (#4280)
Browse files
Fix some bugs in text2sql. (#4279) (#4281)
### What problem does this PR solve?
- Fix the incorrect results produced when parsing CSV files of the QA
knowledge base in the text2sql scenario. CSV files are now processed using
the csv library, and CSV parsing is decoupled from TXT parsing.
- Most LLMs return results in markdown format (```sql query ```). Fix the
execution error caused by LLM output in SQL markdown format.

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- agent/component/exesql.py +8 -12
- rag/app/qa.py +34 -2
    	
        agent/component/exesql.py
    CHANGED
    
    | @@ -65,20 +65,16 @@ class ExeSQL(ComponentBase, ABC): | |
| 65 | 
             
                    self._loop += 1
         | 
| 66 |  | 
| 67 | 
             
                    ans = self.get_input()
         | 
| 68 | 
            -
                  
         | 
| 69 | 
            -
             | 
| 70 | 
             
                    ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
         | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
                        else:
         | 
| 78 | 
            -
                            print("no markdown")
         | 
| 79 | 
            -
                        ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
         | 
| 80 | 
             
                    else:
         | 
| 81 | 
            -
                         | 
|  | |
| 82 | 
             
                    ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
         | 
| 83 | 
             
                    ans = re.sub(r';[^;]*$', r';', ans)
         | 
| 84 | 
             
                    if not ans:
         | 
|  | |
| 65 | 
             
                    self._loop += 1
         | 
| 66 |  | 
| 67 | 
             
                    ans = self.get_input()
         | 
|  | |
|  | |
| 68 | 
             
                    ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                    # improve the information extraction, most llm return results in markdown format ```sql query ```
         | 
| 71 | 
            +
                    match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
         | 
| 72 | 
            +
                    if match:
         | 
| 73 | 
            +
                        ans = match.group(1)  # Query content
         | 
| 74 | 
            +
                        print(ans)
         | 
|  | |
|  | |
|  | |
| 75 | 
             
                    else:
         | 
| 76 | 
            +
                        print("no markdown")
         | 
| 77 | 
            +
                    ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
         | 
| 78 | 
             
                    ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
         | 
| 79 | 
             
                    ans = re.sub(r';[^;]*$', r';', ans)
         | 
| 80 | 
             
                    if not ans:
         | 
    	
        rag/app/qa.py
    CHANGED
    
    | @@ -12,6 +12,7 @@ | |
| 12 | 
             
            #
         | 
| 13 | 
             
            import logging
         | 
| 14 | 
             
            import re
         | 
|  | |
| 15 | 
             
            from copy import deepcopy
         | 
| 16 | 
             
            from io import BytesIO
         | 
| 17 | 
             
            from timeit import default_timer as timer
         | 
| @@ -25,7 +26,6 @@ from docx import Document | |
| 25 | 
             
            from PIL import Image
         | 
| 26 | 
             
            from markdown import markdown
         | 
| 27 |  | 
| 28 | 
            -
             | 
| 29 | 
             
            class Excel(ExcelParser):
         | 
| 30 | 
             
                def __call__(self, fnm, binary=None, callback=None):
         | 
| 31 | 
             
                    if not binary:
         | 
| @@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): | |
| 320 | 
             
                        res.append(beAdoc(deepcopy(doc), q, a, eng))
         | 
| 321 | 
             
                    return res
         | 
| 322 |  | 
| 323 | 
            -
                elif re.search(r"\.(txt | 
| 324 | 
             
                    callback(0.1, "Start to parse.")
         | 
| 325 | 
             
                    txt = get_text(filename, binary)
         | 
| 326 | 
             
                    lines = txt.split("\n")
         | 
| @@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): | |
| 359 |  | 
| 360 | 
             
                    return res
         | 
| 361 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 362 | 
             
                elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         | 
| 363 | 
             
                    callback(0.1, "Start to parse.")
         | 
| 364 | 
             
                    pdf_parser = Pdf()
         | 
|  | |
| 12 | 
             
            #
         | 
| 13 | 
             
            import logging
         | 
| 14 | 
             
            import re
         | 
| 15 | 
            +
            import csv
         | 
| 16 | 
             
            from copy import deepcopy
         | 
| 17 | 
             
            from io import BytesIO
         | 
| 18 | 
             
            from timeit import default_timer as timer
         | 
|  | |
| 26 | 
             
            from PIL import Image
         | 
| 27 | 
             
            from markdown import markdown
         | 
| 28 |  | 
|  | |
| 29 | 
             
            class Excel(ExcelParser):
         | 
| 30 | 
             
                def __call__(self, fnm, binary=None, callback=None):
         | 
| 31 | 
             
                    if not binary:
         | 
|  | |
| 320 | 
             
                        res.append(beAdoc(deepcopy(doc), q, a, eng))
         | 
| 321 | 
             
                    return res
         | 
| 322 |  | 
| 323 | 
            +
                elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
         | 
| 324 | 
             
                    callback(0.1, "Start to parse.")
         | 
| 325 | 
             
                    txt = get_text(filename, binary)
         | 
| 326 | 
             
                    lines = txt.split("\n")
         | 
|  | |
| 359 |  | 
| 360 | 
             
                    return res
         | 
| 361 |  | 
| 362 | 
            +
                elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
         | 
| 363 | 
            +
                    callback(0.1, "Start to parse.")
         | 
| 364 | 
            +
                    txt = get_text(filename, binary)
         | 
| 365 | 
            +
                    lines = txt.split("\n")
         | 
| 366 | 
            +
                    delimiter = "\t" if any("\t" in line for line in lines) else ","
         | 
| 367 | 
            +
             | 
| 368 | 
            +
                    fails = []
         | 
| 369 | 
            +
                    question, answer = "", ""
         | 
| 370 | 
            +
                    res = []
         | 
| 371 | 
            +
                    reader = csv.reader(lines, delimiter=delimiter)
         | 
| 372 | 
            +
             | 
| 373 | 
            +
                    for i, row in enumerate(reader):
         | 
| 374 | 
            +
                        if len(row) != 2:
         | 
| 375 | 
            +
                            if question:
         | 
| 376 | 
            +
                                answer += "\n" + lines[i]
         | 
| 377 | 
            +
                            else:
         | 
| 378 | 
            +
                                fails.append(str(i + 1))
         | 
| 379 | 
            +
                        elif len(row) == 2:
         | 
| 380 | 
            +
                            if question and answer:
         | 
| 381 | 
            +
                                res.append(beAdoc(deepcopy(doc), question, answer, eng))
         | 
| 382 | 
            +
                            question, answer = row
         | 
| 383 | 
            +
                        if len(res) % 999 == 0:
         | 
| 384 | 
            +
                            callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
         | 
| 385 | 
            +
                                f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         | 
| 386 | 
            +
             | 
| 387 | 
            +
                    if question:
         | 
| 388 | 
            +
                        res.append(beAdoc(deepcopy(doc), question, answer, eng))
         | 
| 389 | 
            +
             | 
| 390 | 
            +
                    callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
         | 
| 391 | 
            +
                        f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         | 
| 392 | 
            +
                    return res
         | 
| 393 | 
            +
             | 
| 394 | 
             
                elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         | 
| 395 | 
             
                    callback(0.1, "Start to parse.")
         | 
| 396 | 
             
                    pdf_parser = Pdf()
         | 
