Spaces:

pszemraj
/

ballpark-trivia

Runtime error

App Files Files Community

Peter Szemraj committed on Feb 2, 2022

Commit

f28496d

•

1 Parent(s): 31f09ae

:sparkles: upgrade to v2 code

Browse files

Files changed (7) hide show

ai_single_response.py +0 -324
app.py +97 -71
grammar_improve.py +34 -15
requirements.txt +1 -1
symspell_rsc/frequency_bigramdictionary_en_243_342.txt +0 -0
symspell_rsc/frequency_dictionary_en_82_765.txt +0 -0
utils.py +20 -1

ai_single_response.py DELETED Viewed

@@ -1,324 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-ai_single_response.py
-An executable way to call the model. example:
-*\gpt2_chatbot> python ai_single_response.py --model "GPT2_conversational_355M_WoW10k" --prompt "hey, what's up?" --time
-query_gpt_model is used throughout the code, and is the "fundamental" building block of the bot and how everything works. Test this function with a few different models.
-"""
-from aitextgen import aitextgen
-import argparse
-import pprint as pp
-import sys
-import time
-import warnings
-from datetime import datetime
-from pathlib import Path
-from grammar_improve import remove_trailing_punctuation
-from utils import print_spacer, cleantxt_wrap
-warnings.filterwarnings(action="ignore", message=".*gradient_checkpointing*")
-def extract_response(full_resp: list, plist: list, verbose: bool = False):
-    """
-    extract_response - helper fn for ai_single_response.py. By default aitextgen returns the prompt and the response, we just want the response
-    Args:
-        full_resp (list): a list of strings, each string is a response
-        plist (list): a list of strings, each string is a prompt
-        verbose (bool, optional): 4 debug. Defaults to False.
-    """
-    full_resp = [cleantxt_wrap(ele) for ele in full_resp]
-    plist = [cleantxt_wrap(pr) for pr in plist]
-    p_len = len(plist)
-    assert (
-        len(full_resp) >= p_len
-    ), "model output should have as many lines or longer as the input."
-    if set(plist).issubset(full_resp):
-        del full_resp[:p_len]  # remove the prompts from the responses
-    else:
-        print("the isolated responses are:\n")
-        pp.pprint(full_resp)
-        print_spacer()
-        print("the input prompt was:\n")
-        pp.pprint(plist)
-        print_spacer()
-        sys.exit("Exiting: some prompts not found in the responses")
-    if verbose:
-        print("the isolated responses are:\n")
-        pp.pprint(full_resp)
-        print_spacer()
-        print("the input prompt was:\n")
-        pp.pprint(plist)
-        print_spacer()
-    return full_resp  # list of only the model generated responses
-def get_bot_response(
-    name_resp: str, model_resp: str, name_spk: str, verbose: bool = False
-):
-    """
-    get_bot_response  - from the model response, extract the bot response. This is needed because depending on the generation length the model may return more than one response.
-    Args:   name_resp (str): the name of the responder
-    model_resp (str): the model response
-    verbose (bool, optional): 4 debug. Defaults to False.
-    returns: fn_resp (list of str)
-    """
-    fn_resp = []
-    name_counter = 0
-    break_safe = False
-    for resline in model_resp:
-        if resline.startswith(name_resp):
-            name_counter += 1
-            break_safe = True  # know the line is from bot as this line starts with the name of the bot
-            continue
-        if name_spk is not None and name_spk.lower() in resline.lower():
-            break
-        if ":" in resline and name_counter > 0:
-            if break_safe:
-                # we know this is a response from the bot even tho ':' is in the line
-                fn_resp.append(resline)
-                break_safe = False
-            else:
-                # we do not know this is a response from the bot. could be name of another person.. bot is "finished" response
-                break
-        else:
-            fn_resp.append(resline)
-            break_safe = False
-    if verbose:
-        print("the full response is:\n")
-        print("\n".join(fn_resp))
-    return fn_resp
-def query_gpt_model(
-    prompt_msg: str,
-    speaker=None,
-    responder=None,
-    resp_length=64,
-    resp_min=10,
-    kparam=50,
-    temp=0.75,
-    top_p=0.90,
-    batch_size=1,
-    verbose=False,
-    use_gpu=False,
-    nbeams=1,
-):
-    """
-    query_gpt_model - the main function that calls the model.
-    Parameters:
-    -----------
-    prompt_msg (str): the prompt to be sent to the model
-    speaker (str, optional): the name of the speaker. Defaults to None.
-    responder (str, optional): the name of the responder. Defaults to None.
-    resp_length (int, optional): the length of the response. Defaults to 64.
-    resp_min (int, optional): the minimum length of the response. Defaults to 4.
-    kparam (int, optional): the k parameter for the top_p. Defaults to 150.
-    temp (float, optional): the temperature for the top_p. Defaults to 0.75.
-    top_p (float, optional): the top_p parameter for the top_p. Defaults to 0.65.
-    verbose (bool, optional): 4 debug. Defaults to False.
-    use_gpu (bool, optional): use gpu. Defaults to False.
-    nbeams (int, optional): the number of beams to search and return best value. Defaults to 1.
-    """
-    from aitextgen.utils import GPT2ConfigCPU
-    ai = aitextgen(
-        model="pszemraj/Ballpark-Trivia-L",  # THIS WORKS. XL is not working
-        to_gpu=use_gpu,
-    )
-    p_list = []  # track conversation
-    p_list.append(speaker.lower() + ":" + "\n")
-    p_list.append(prompt_msg.lower() + "\n")
-    p_list.append("\n")
-    p_list.append(responder.lower() + ":" + "\n")
-    this_prompt = "".join(p_list)
-    pr_len = len(this_prompt)
-    if verbose:
-        print("overall prompt:\n")
-        pp.pprint(this_prompt, indent=4)
-    # call the model
-    print("\n... generating...")
-    this_result = ai.generate(
-        n=1,
-        batch_size=batch_size,
-        # the prompt input counts for text length constraints
-        max_length=resp_length + pr_len,
-        min_length=resp_min + pr_len,
-        prompt=this_prompt,
-        top_k=kparam,
-        top_p=top_p,
-        do_sample=True,
-        return_as_list=True,
-        n_beams=nbeams,
-        temperature=temp,
-        verbose=True,  # in this case verbose is just to enable huggingface logging
-        use_cache=True,
-    )
-    if verbose:
-        print("\n... generated:\n")
-        pp.pprint(this_result)  # for debugging
-    # process the full result to get the ~bot response~ piece
-    this_result = str(this_result[0]).split(
-        "\n"
-    )  # TODO: adjust hardcoded value for index to dynamic (if n>1)
-    og_res = this_result.copy()
-    og_prompt = p_list.copy()
-    diff_list = extract_response(
-        this_result, p_list, verbose=verbose
-    )  # isolate the responses from the prompts
-    # extract the bot response from the model generated text
-    bot_dialogue = get_bot_response(
-        name_resp=responder, model_resp=diff_list, name_spk=speaker, verbose=verbose
-    )
-    print(f"DEBUG: {bot_dialogue} was original response pre-SC")
-    bot_resp = ", ".join(bot_dialogue)
-    bot_resp = bot_resp.strip()
-    # remove the last ',' '.' chars
-    bot_resp = remove_trailing_punctuation(bot_resp)
-    if verbose:
-        print("\n... bot response:\n")
-        pp.pprint(bot_resp)
-    og_prompt.append(bot_resp + "\n")
-    og_prompt.append("\n")
-    print("\nfinished!")
-    # return the bot response and the full conversation
-    return {"out_text": bot_resp, "full_conv": og_prompt}  # model responses
-# Set up the parsing of command-line arguments
-def get_parser():
-    """
-    get_parser a helper function for the argparse module, relevant if this is run as a script.
-    """
-    parser = argparse.ArgumentParser(
-        description="submit a message and have a pretrained GPT model respond"
-    )
-    parser.add_argument(
-        "--prompt",
-        required=True,  # MUST HAVE A PROMPT
-        type=str,
-        help="the message the bot is supposed to respond to. Prompt is said by speaker, answered by responder.",
-    )
-    parser.add_argument(
-        "--model",
-        required=False,
-        type=str,
-        default="GPT2_trivNatQAdailydia_774M_175Ksteps",
-        help="folder - with respect to git directory of your repo that has the model files in it (pytorch.bin + "
-        "config.json). No models? Run the script download_models.py",
-    )
-    parser.add_argument(
-        "--speaker",
-        required=False,
-        default=None,
-        help="Who the prompt is from (to the bot). Primarily relevant to bots trained on multi-individual chat data",
-    )
-    parser.add_argument(
-        "--responder",
-        required=False,
-        default="person beta",
-        help="who the responder is. Primarily relevant to bots trained on multi-individual chat data",
-    )
-    parser.add_argument(
-        "--topk",
-        required=False,
-        type=int,
-        default=150,
-        help="how many responses to sample (positive integer). lower = more random responses",
-    )
-    parser.add_argument(
-        "--temp",
-        required=False,
-        type=float,
-        default=0.75,
-        help="specify temperature hyperparam (0-1). roughly considered as 'model creativity'",
-    )
-    parser.add_argument(
-        "--topp",
-        required=False,
-        type=float,
-        default=0.65,
-        help="nucleus sampling frac (0-1). aka: what fraction of possible options are considered?",
-    )
-    parser.add_argument(
-        "--verbose",
-        default=False,
-        action="store_true",
-        help="pass this argument if you want all the printouts",
-    )
-    parser.add_argument(
-        "--time",
-        default=False,
-        action="store_true",
-        help="pass this argument if you want to know runtime",
-    )
-    return parser
-if __name__ == "__main__":
-    # parse the command line arguments
-    args = get_parser().parse_args()
-    query = args.prompt
-    model_dir = str(args.model)
-    model_loc = Path.cwd() / model_dir
-    spkr = args.speaker
-    rspndr = args.responder
-    k_results = args.topk
-    my_temp = args.temp
-    my_top_p = args.topp
-    want_verbose = args.verbose
-    want_rt = args.time
-    st = time.perf_counter()
-    resp = query_gpt_model(
-        folder_path=model_loc,
-        prompt_msg=query,
-        speaker=spkr,
-        responder=rspndr,
-        kparam=k_results,
-        temp=my_temp,
-        top_p=my_top_p,
-        verbose=want_verbose,
-        use_gpu=False,
-    )
-    output = resp["out_text"]
-    pp.pprint(output, indent=4)
-    rt = round(time.perf_counter() - st, 1)
-    if want_rt:
-        print("took {runtime} seconds to generate. \n".format(runtime=rt))
-    if want_verbose:
-        print("finished - ", datetime.now())
-        p_list = resp["full_conv"]
-        print("A transcript of your chat is as follows: \n")
-        p_list = [item.strip() for item in p_list]
-        pp.pprint(p_list)

app.py CHANGED Viewed

@@ -1,19 +1,9 @@
 """
-deploy-as-bot\gradio_chatbot.py
-A system, method for deploying to Gradio. Gradio is a basic "deploy" interface which allows for other users to test your model from a web URL. It also enables some basic functionality like user flagging for weird responses.
 """
-from flask import (
-    Flask,
-    request,
-    session,
-    jsonify,
-    abort,
-    send_file,
-    render_template,
-    redirect,
-)
-from ai_single_response import query_gpt_model
-from datetime import datetime
 from transformers import pipeline
 from cleantext import clean
 from pathlib import Path
@@ -26,7 +16,7 @@ import os
 import sys
 from os.path import dirname
 import nltk
-from aitextgen import aitextgen
 from grammar_improve import (
     detect_propers,
     load_ns_checker,
@@ -35,6 +25,7 @@ from grammar_improve import (
     remove_trailing_punctuation,
     build_symspell_obj,
     symspeller,
 )
 from utils import (
@@ -46,46 +37,78 @@ nltk.download("stopwords")  # TODO: find where this requirement originates from
 sys.path.append(dirname(dirname(os.path.abspath(__file__))))
 warnings.filterwarnings(action="ignore", message=".*gradient_checkpointing*")
 logging.basicConfig()
 cwd = Path.cwd()
 my_cwd = str(cwd.resolve())  # string so it can be passed to os.path() objects
-def load_model(model_name=None, use_gpu=False):
-    _name = "pszemraj/Ballpark-Trivia-L" if model_name is None else model_name
-    print(f"\nloading model: {_name}\n")
-    ai = aitextgen(
-        model=_name,
-        to_gpu=use_gpu,
-    )
-def ask_gpt(message: str):
     """
-    ask_gpt - queries the relevant model with a prompt message and returns the response.
-                NOTE: because this is for models trained with person alpha and person beta,
-                there is no need for customizing / changing the name settings and so on
-    Args:
-        message (str): prompt message to respond to, usually a question
-    Returns:
-        [str]: [model response as a string]
     """
     st = time.perf_counter()
     prompt = clean(message)  # clean user input
     prompt = prompt.strip()  # get rid of any extra whitespace
-    if len(prompt) > 200:
-        prompt = prompt[-200:]  # truncateblack
-    resp = query_gpt_model(
-        prompt_msg=prompt,
-        speaker="person alpha",
-        responder="person beta",
-        kparam=30,
-        top_p=0.9,
-        batch_size=1,
-        nbeams=1,
-        # TODO - allow users to adjust these 4 da memes
-    )  # using top_P and top_k to avoid the "too many hypotheses" error, not using temp
     rawtxt = resp["out_text"]
     # check for proper nouns
     if basic_sc and not detect_propers(rawtxt):
@@ -95,26 +118,16 @@ def ask_gpt(message: str):
     else:
         # no correction needed
         cln_resp = rawtxt.strip()
-    bot_resp = corr(remove_repeated_words(cln_resp))
-    print(f"the prompt was:\n {message} and the response was:\n {bot_resp}\n")
-    rt = round(time.perf_counter() - st, 2)
-    print(f"took {rt} sec to respond\n")
     return remove_trailing_punctuation(bot_resp)
-def chat(trivia_query):
-    history = []
-    response = ask_gpt(trivia_query)
-    history = [trivia_query, response]
-    html = ""
-    for item in history:
-        html += f"<b>{item}</b> <br><br>"
-    html += ""
-    return html
 def get_parser():
     """
     get_parser - a helper function for the argparse module
@@ -127,9 +140,8 @@ def get_parser():
         "--model",
         required=False,
         type=str,
-        default="pszemraj/Ballpark-Trivia-L",
-        help="folder - with respect to git directory of your repo that has the model files in it (pytorch.bin + "
-        "config.json)",
     )
     parser.add_argument(
         "--basic-sc",
@@ -139,21 +151,35 @@ def get_parser():
         help="turn on symspell (baseline) correction instead of the more advanced neural net models",
     )
     return parser
 if __name__ == "__main__":
     args = get_parser().parse_args()
     default_model = str(args.model)
-    load_model(default_model)
-    model_loc = cwd.parent / default_model
-    model_loc = str(model_loc.resolve())
-    basic_sc = args.basic_sc
     if basic_sc:
-        print("defaulting to symspell for spell checking")
         schnellspell = build_symspell_obj()
     else:
-        print("using advanced spell checker (Neuspell)")
         ns_checker = load_ns_checker(fast=False)
     print(f"using model stored here: \n {model_loc} \n")
@@ -188,7 +214,7 @@ if __name__ == "__main__":
         ],
         title=f"Ballpark Trivia: {default_model} Model",
         description=f"Are you frequently asked google-able Trivia questions and annoyed by it? Well, this is the app for you! Ballpark Trivia Bot answers any trivia question with something that sounds plausible but is probably not 100% correct. \n\n One might say.. the answers are in the right ballpark.",
-        article="Further details can be found in the [model card](https://huggingface.co/pszemraj/Ballpark-Trivia-L).  If you are interested in a more deceptively incorrect model, there is also [an XL version](https://huggingface.co/pszemraj/Ballpark-Trivia-XL) on my page.\n\n"
         "**Important Notes & About:**\n\n"
         "1. the model can take up to 60 seconds to respond sometimes, patience is a virtue.\n"
         "2. the model started from a pretrained checkpoint, and was trained on several different datasets. Anything it says should be fact-checked before being regarded as a true statement.\n"
@@ -209,4 +235,4 @@ if __name__ == "__main__":
         # prevent_thread_lock=True,
         # share=True,
         enable_queue=True,  # also allows for dealing with multiple users simultaneously (per newer gradio version)
-    )

 """
+app.py - the main file for the app. This loads the text-generation pipeline and the spell checker, then serves the chatbot through a Gradio interface.
 """
+import torch
 from transformers import pipeline
 from cleantext import clean
 from pathlib import Path
 import sys
 from os.path import dirname
 import nltk
+from converse import discussion
 from grammar_improve import (
     detect_propers,
     load_ns_checker,
     remove_trailing_punctuation,
     build_symspell_obj,
     symspeller,
+    fix_punct_spacing,
 )
 from utils import (
 sys.path.append(dirname(dirname(os.path.abspath(__file__))))
 warnings.filterwarnings(action="ignore", message=".*gradient_checkpointing*")
+import transformers
+transformers.logging.set_verbosity_error()
 logging.basicConfig()
 cwd = Path.cwd()
 my_cwd = str(cwd.resolve())  # string so it can be passed to os.path() objects
+def chat(trivia_query):
+    """
+    chat - a function that takes in a trivia query and returns the exchange rendered as HTML
+    Args:
+        trivia_query (str): the question entered by the user
+    Returns:
+        str: the query followed by the bot response, each wrapped in <b> tags and followed by two <br> line breaks
+    """
+    from html import escape  # stdlib; imported locally so no file-level import has to change
+    # ask_gpt has a required chat_pipe parameter; my_chatbot is the module-level
+    # pipeline built in the __main__ section before the interface is launched
+    response = ask_gpt(message=trivia_query, chat_pipe=my_chatbot)
+    # escape both strings so markup typed by the user (or produced by the model)
+    # is displayed as text instead of being interpreted as HTML
+    return "".join(
+        f"<b>{escape(str(item))}</b> <br><br>" for item in (trivia_query, response)
+    )
+def ask_gpt(
+    message: str,
+    chat_pipe,
+    speaker="person alpha",
+    responder="person beta",
+    max_len=196,
+    top_p=0.95,
+    top_k=50,
+    temperature=0.6,
+):
     """
+    ask_gpt - a function that takes in a prompt and generates a response using the pipeline. This interacts with the discussion function.
+    Parameters:
+        message (str): the question to ask the bot
+        chat_pipe: the text-generation pipeline object used to generate the response (required, no default)
+        speaker (str): the name of the speaker (default: "person alpha")
+        responder (str): the name of the responder (default: "person beta")
+        max_len (int): the maximum length of the response (default: 196)
+        top_p (float): the top probability threshold (default: 0.95)
+        top_k (int): the top k threshold (default: 50)
+        temperature (float): the temperature of the response (default: 0.6)
     """
     st = time.perf_counter()
     prompt = clean(message)  # clean user input
     prompt = prompt.strip()  # get rid of any extra whitespace
+    in_len = len(prompt)
+    if in_len > 512:
+        prompt = prompt[-512:]  # truncate to 512 chars
+        print(f"Truncated prompt to last 512 chars: started with {in_len} chars")
+        max_len = min(max_len, 512)
+    resp = discussion(
+        prompt_text=prompt,
+        pipeline=chat_pipe,
+        speaker=speaker,
+        responder=responder,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        max_length=max_len,
+    )
+    gpt_et = time.perf_counter()
+    gpt_rt = round(gpt_et - st, 2)
     rawtxt = resp["out_text"]
     # check for proper nouns
     if basic_sc and not detect_propers(rawtxt):
     else:
         # no correction needed
         cln_resp = rawtxt.strip()
+    bot_resp_a = corr(remove_repeated_words(cln_resp))
+    bot_resp = fix_punct_spacing(bot_resp_a)
+    print(f"the prompt was:\n\t{message}\nand the response was:\n\t{bot_resp}\n")
+    corr_rt = round(time.perf_counter() - gpt_et, 4)
+    print(
+        f"took {gpt_rt + corr_rt} sec to respond, {gpt_rt} for GPT, {corr_rt} for correction\n"
+    )
     return remove_trailing_punctuation(bot_resp)
 def get_parser():
     """
     get_parser - a helper function for the argparse module
         "--model",
         required=False,
         type=str,
+        default="pszemraj/Ballpark-Trivia-XL",  # default model
+        help="the model to use for the chatbot on https://huggingface.co/models OR a path to a local model",
     )
     parser.add_argument(
         "--basic-sc",
         help="turn on symspell (baseline) correction instead of the more advanced neural net models",
     )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        default=False,
+        help="turn on verbose logging",
+    )
     return parser
 if __name__ == "__main__":
     args = get_parser().parse_args()
     default_model = str(args.model)
+    model_loc = Path(default_model)  # if the model is a path, use it
+    basic_sc = args.basic_sc  # whether to use the baseline spellchecker
+    device = 0 if torch.cuda.is_available() else -1
+    print(f"CUDA avail is {torch.cuda.is_available()}")
+    my_chatbot = (
+        pipeline("text-generation", model=model_loc.resolve(), device=device)
+        if model_loc.exists() and model_loc.is_dir()
+        else pipeline("text-generation", model=default_model, device=device)
+    )  # if the model is a name, use it. stays on CPU if no GPU available
+    print(f"using model {my_chatbot.model}")
     if basic_sc:
+        print("Using the baseline spellchecker")
         schnellspell = build_symspell_obj()
     else:
+        print("using Neuspell spell checker")
         ns_checker = load_ns_checker(fast=False)
     print(f"using model stored here: \n {model_loc} \n")
         ],
         title=f"Ballpark Trivia: {default_model} Model",
         description=f"Are you frequently asked google-able Trivia questions and annoyed by it? Well, this is the app for you! Ballpark Trivia Bot answers any trivia question with something that sounds plausible but is probably not 100% correct. \n\n One might say.. the answers are in the right ballpark.",
+        article="Further details can be found in the [model card](https://huggingface.co/pszemraj/Ballpark-Trivia-XL).\n\n"
         "**Important Notes & About:**\n\n"
         "1. the model can take up to 60 seconds to respond sometimes, patience is a virtue.\n"
         "2. the model started from a pretrained checkpoint, and was trained on several different datasets. Anything it says should be fact-checked before being regarded as a true statement.\n"
         # prevent_thread_lock=True,
         # share=True,
         enable_queue=True,  # also allows for dealing with multiple users simultaneously (per newer gradio version)
+    )

grammar_improve.py CHANGED Viewed

@@ -4,6 +4,7 @@ grammar_improve.py - this .py script contains functions to improve the grammar o
 """
 from datetime import datetime
 import pprint as pp
 from neuspell import BertChecker, SclstmChecker
 import neuspell
@@ -11,9 +12,11 @@ import math
 from cleantext import clean
 import time
 import re
 from symspellpy.symspellpy import SymSpell
 def detect_propers(text: str):
     """
@@ -98,6 +101,14 @@ def remove_trailing_punctuation(text: str, fuLL_strip=False):
         return text.strip(".,;:")
 """
 start of SymSpell code
 """
@@ -126,6 +137,11 @@ def symspeller(
         dictionary_path : str, optional, default=None, the path to the dictionary file
         bigram_path : str, optional, default=None, the path to the bigram dictionary file
         verbose : bool, optional, default=False, whether to print the results
     """
     assert len(my_string) > 0, "entered string for correction is empty"
@@ -202,7 +218,8 @@ def build_symspell_obj(
 """
-NEEDED FOR T5
 import torch
 from transformers import T5Tokenizer, T5ForConditionalGeneration
@@ -282,19 +299,21 @@ def load_ns_checker(customckr=None, fast=False):
         [neuspell.NeuSpell]: [neuspell checker object]
     """
     st = time.perf_counter()
-    if customckr is None and not fast:
-        checker = BertChecker(
-            pretrained=True
-        )  # load the default checker, has the best balance
-    elif customckr is None and fast:
-        checker = SclstmChecker(
-            pretrained=True
-        )  # this one is faster but not as accurate
-    else:
-        checker = customckr(pretrained=True)
     rt_min = (time.perf_counter() - st) / 60
     print(f"\n\nloaded checker in {rt_min} minutes")
     return checker
@@ -320,7 +339,7 @@ def neuspell_correct(input_text: str, checker=None, verbose=False):
         return input_text
     if checker is None:
-        print("NOTE - no checker provided, using default checker")
         checker = SclstmChecker(pretrained=True)
     corrected = checker.correct(input_text)

 """
 from datetime import datetime
+import os
 import pprint as pp
 from neuspell import BertChecker, SclstmChecker
 import neuspell
 from cleantext import clean
 import time
 import re
+import sys
 from symspellpy.symspellpy import SymSpell
+from utils import suppress_stdout
 def detect_propers(text: str):
     """
         return text.strip(".,;:")
+def fix_punct_spacing(text: str):
+    """
+    fix_punct_spacing - normalize the whitespace around the punctuation marks ? ! . ,
+    Whitespace before a run of these marks is dropped, whitespace inside the run is dropped, and exactly one space is placed after the run. Afterwards any non-word character that is immediately followed by an identical character is collapsed to a single occurrence (e.g. "!!" -> "!").
+    NOTE: a space is added after every matched run, so text ending in punctuation keeps a trailing space and "3.14" becomes "3. 14".
+    Args:
+        text (str): the text to clean up
+    Returns:
+        str: the text with normalized punctuation spacing
+    """
+    punct_run = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
+    def _tighten(match):
+        # the pattern allows any whitespace (tabs, newlines) between marks, so remove all of it, not only spaces
+        return "".join(match.group(1).split()) + " "
+    spc_text = punct_run.sub(_tighten, text)
+    # drop a non-word character when the very next character is the same one
+    return re.sub(r"(\W)(?=\1)", "", spc_text)
 """
 start of SymSpell code
 """
         dictionary_path : str, optional, default=None, the path to the dictionary file
         bigram_path : str, optional, default=None, the path to the bigram dictionary file
         verbose : bool, optional, default=False, whether to print the results
+    Returns
+    -------
+        list,
     """
     assert len(my_string) > 0, "entered string for correction is empty"
 """
+# if using t5b_correction to check for spelling errors, use this code to initialize the objects
 import torch
 from transformers import T5Tokenizer, T5ForConditionalGeneration
         [neuspell.NeuSpell]: [neuspell checker object]
     """
     st = time.perf_counter()
+    # stop all printing to the console
+    with suppress_stdout():
+        if customckr is None and not fast:
+            checker = BertChecker(
+                pretrained=True
+            )  # load the default checker, has the best balance
+        elif customckr is None and fast:
+            checker = SclstmChecker(
+                pretrained=True
+            )  # this one is faster but not as accurate
+        else:
+            checker = customckr(pretrained=True)
     rt_min = (time.perf_counter() - st) / 60
+    # return to standard logging level
     print(f"\n\nloaded checker in {rt_min} minutes")
     return checker
         return input_text
     if checker is None:
+        print("NOTE - no checker provided, loading default checker")
         checker = SclstmChecker(pretrained=True)
     corrected = checker.correct(input_text)

requirements.txt CHANGED Viewed

@@ -3,7 +3,7 @@ sentencepiece>=0.1.96
 tqdm>=4.43.0
 symspellpy>=6.7.0
 requests>=2.24.0
-gradio>=2.5.0
 natsort>=7.1.1
 pandas>=1.3.0
 aitextgen>=0.5.2

 tqdm>=4.43.0
 symspellpy>=6.7.0
 requests>=2.24.0
+gradio>=2.4.6
 natsort>=7.1.1
 pandas>=1.3.0
 aitextgen>=0.5.2

symspell_rsc/frequency_bigramdictionary_en_243_342.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

symspell_rsc/frequency_dictionary_en_82_765.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-general utility functions for loading, saving, and manipulating data
 """
 import os
@@ -19,6 +19,25 @@ import pandas as pd
 from tqdm.auto import tqdm
 def remove_string_extras(mytext):
     # removes everything from a string except A-Za-z0-9 .,;
     return re.sub(r"[^A-Za-z0-9 .,;]+", "", mytext)

 """
+    utils - general utility functions for loading, saving, and manipulating data
 """
 import os
 from tqdm.auto import tqdm
+from contextlib import contextmanager
+import sys
+import os
+@contextmanager
+def suppress_stdout():
+    """
+    suppress_stdout - context manager that discards everything written to sys.stdout inside the `with` block. credit to https://newbedev.com/how-to-suppress-console-output-in-python
+    sys.stdout is pointed at os.devnull while the block runs and the previous stream is put back on exit, even when the block raises.
+    """
+    saved_stream = sys.stdout
+    with open(os.devnull, "w") as null_sink:
+        sys.stdout = null_sink
+        try:
+            yield
+        finally:
+            # always hand the original stream back
+            sys.stdout = saved_stream
 def remove_string_extras(mytext):
     """
     remove_string_extras - delete every character that is not an ASCII letter, a digit, a space, or one of . , ;
     """
     disallowed = re.compile(r"[^A-Za-z0-9 .,;]+")
     return disallowed.sub("", mytext)