Spaces:

retailcrmservices
/

omnidesk-ai-test

Runtime error

App Files Files Community

makcrx committed on Aug 7, 2023

Commit

d869f0d

1 Parent(s): 454c118

test keywords

Browse files

Files changed (4) hide show

app.py +19 -5
extract_keywords.py +122 -0
test.ipynb +77 -7
test_keybert.ipynb +224 -0

app.py CHANGED Viewed

@@ -2,18 +2,32 @@ from langchain.vectorstores import FAISS
 from langchain.embeddings import SentenceTransformerEmbeddings
 import gradio as gr
 import reranking
 embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
 db = FAISS.load_local('faiss_qa', embeddings)
 def main(query):
     query = query.lower()
     result_docs = db.similarity_search_with_score(query, k=20)
-    sentences = [doc[0].page_content for doc in result_docs]
-    #print('----------------------------------------------------------------')
-    #for doc in result_docs:
-    #    print(doc[0].metadata['articleId'], ' | ', doc[0].page_content, ' | ', doc[0].metadata['answer'])
-    score, index = reranking.search(query, sentences)
     return result_docs[index][0].metadata['answer'], score, result_docs[index][0].page_content
 demo = gr.Interface(fn=main, inputs="text", outputs=[

 from langchain.embeddings import SentenceTransformerEmbeddings
 import gradio as gr
 import reranking
+from extract_keywords import init_keyword_extractor, extract_keywords
+# Sentence-transformer embeddings passed to the FAISS store loaded below.
 embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
+# Load the FAISS vector store saved under 'faiss_qa'.
 db = FAISS.load_local('faiss_qa', embeddings)
+# Build the keyword extractor at import time; extract_keywords() would
+# otherwise build it lazily on its first call.
+init_keyword_extractor()
 def main(query):
+    """Answer a support query from the FAISS index.
+
+    The query is lower-cased and the 20 nearest documents are fetched. When
+    the query contains known keywords, only documents sharing at least one
+    keyword with it are kept. Several surviving candidates are re-ranked
+    with ``reranking.search``; a single survivor is used directly.
+
+    Returns a tuple ``(answer, score, matched_text)``. When no candidate
+    survives the keyword filter the tuple is ``('Ответ не найден', 0, '')``.
+    """
     query = query.lower()
+    query_keywords = set(extract_keywords(query))
     result_docs = db.similarity_search_with_score(query, k=20)
+    if query_keywords:
+        # Materialise the filtered candidates as a list: the code below needs
+        # len() and indexing, which a lazy filter() object does not support.
+        result_docs = [
+            doc for doc in result_docs
+            if query_keywords.intersection(extract_keywords(doc[0].page_content))
+        ]
+    if not result_docs:
+        return 'Ответ не найден', 0, ''
+    if len(result_docs) == 1:
+        # Nothing to re-rank; the score is reported as 0 in this case.
+        score, index = 0, 0
+    else:
+        sentences = [doc[0].page_content for doc in result_docs]
+        score, index = reranking.search(query, sentences)
+    best_doc = result_docs[index][0]
+    return best_doc.metadata['answer'], score, best_doc.page_content
 demo = gr.Interface(fn=main, inputs="text", outputs=[

extract_keywords.py ADDED Viewed

	@@ -0,0 +1,122 @@

+def flatten(items, seqtypes=(list, tuple)):
+    """Flatten nested sequences inside ``items`` in place and return it.
+
+    Any element that is an instance of ``seqtypes`` is replaced by its own
+    elements, repeatedly, until no such element remains. Empty nested
+    sequences disappear. The list passed in is mutated and returned.
+    """
+    pos = 0
+    while pos < len(items):
+        if isinstance(items[pos], seqtypes):
+            # Splice the nested sequence into its slot and look at the same
+            # position again, since the first spliced element may be nested.
+            items[pos:pos + 1] = items[pos]
+        else:
+            pos += 1
+    return items
+# Keyword table: each entry pairs a canonical keyword with the alternative
+# spellings that canonical_keywords() maps back to it. An empty alias list
+# means the keyword has no alternative spellings.
+aliases = [
+  #('canonical name', ['aliases', ...])
+  # NOTE(review): 'gh' looks like 'пр' typed with a Latin keyboard layout - confirm.
+  ('почта россия', ['почта', 'почта рф', 'пр', 'gh']),
+  ('почта россия трекинг', ['пр трекинг', 'почта трекинг', 'пр трэкинг', 'почта трэкинг']),
+  ('реестр почта', ['реестр пр', 'реестр почта россии']),
+  ('реестр пэк', []),
+  ('реквизиты', []),
+  ('пешкарики', []),
+  ('импорт лидов директ', []),
+  ('яндекс доставка экспресс', ['яндекс доставка express', 'яд экспресс', 'ядоставка экспресс']),
+  ('яндекс доставка ndd', ['яд ндд', 'я доставка ндд', 'ядоставка ндд', 'модуль ндд']),
+  ('яндекс метрика', ['яндекс метрика импорт']),
+  ('альфабанк', ['альфа банк', 'alfabank', 'альфа']),
+  ('импорт лидов facebook', ['импорт лидов fb', 'загрузка лидов fb', 'лиды фейсбук', 'импорт лидов фб', 'fb lead']),
+  ('маркетинговые расходы', ['расходы', 'загрузка расходов']),
+  ('cloudpayments', ['клауд', 'клаудпеймент', 'клаудпейментс']),
+  ('robokassa', ['робокасса', 'робокаса']),
+  ('sipuni', ['сипуни', 'сипьюни']),
+  ('mailchimp', ['майлчимп', 'мейлчим', 'мейлчимп']),
+  ('unisender', ['юнисендер']),
+  ('яндекс аудитории', ['экспорт аудитории', 'экспорт яндекс аудитории']),
+  ('экспорт facebook', ['экспорт сегментов facebook', 'экспорт fb', 'экспорт фейсбук', 'экспорт аудиторий фб', 'fb экспорт']),
+  ('экспорт вк', ['экспорт сегментов vkontakte', 'экспорт vk', 'экспорт контакте'])
+]
+# Flat list of every canonical name followed by its aliases;
+# init_keyword_extractor() normalizes it into the vectorizer vocabulary.
+vocab_raw = flatten([[k] + keywords for k, keywords in aliases])
+import string
+import pymorphy3
+# Shared pymorphy3 analyzer, created by the first normalize_word() call that needs it.
+morph = None
+def normalize_word(word):
+    """Return the normal (dictionary) form of a single word.
+
+    The word 'лид' is returned unchanged and never reaches the analyzer;
+    every other word is reduced to the normal form of its first parse.
+    """
+    global morph
+    if word != 'лид':
+        if morph is None:
+            morph = pymorphy3.MorphAnalyzer()
+        first_parse = morph.parse(word)[0]
+        return first_parse.normal_form
+    return word
+def tokenize_sentence(text):
+    """Split ``text`` into a list of normalized words.
+
+    Every character of ``string.punctuation`` is replaced by a space, the
+    text is split on whitespace, and each word goes through normalize_word().
+    """
+    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
+    normalized = []
+    for word in text.translate(punctuation_to_space).split():
+        normalized.append(normalize_word(word))
+    return normalized
+def normalize_sentence(text):
+    """Return ``text`` as its normalized words joined by single spaces."""
+    words = tokenize_sentence(text)
+    return " ".join(words)
+def canonical_keywords(keywords):
+    """Replace keyword aliases with their canonical keyword names.
+
+    Each keyword is normalized with normalize_sentence(). A keyword whose
+    normalized form equals a normalized alias from ``aliases`` is replaced
+    by the normalized canonical name of that entry; any other keyword is
+    kept in its normalized form. The result has one item per input keyword,
+    in the same order.
+    """
+    # Normalize the alias table once per call instead of once per keyword.
+    # setdefault keeps the first entry when two aliases normalize alike,
+    # which matches a front-to-back scan of ``aliases``.
+    alias_to_canonical = {}
+    for canonical_name, alias_names in aliases:
+        canonical_name = normalize_sentence(canonical_name)
+        for alias in alias_names:
+            alias_to_canonical.setdefault(normalize_sentence(alias), canonical_name)
+    result = []
+    for k in keywords:
+        k = normalize_sentence(k)
+        result.append(alias_to_canonical.get(k, k))
+    return result
+def merge_keywords(keywords):
+    """Drop keywords that are a prefix of a longer keyword in the list.
+
+    Keywords are visited longest first. One is kept only if no keyword kept
+    so far starts with it, compared case-insensitively, so exact duplicates
+    are dropped as well. Returns the kept keywords, longest first.
+    """
+    result = []
+    for k in sorted(keywords, key=len, reverse=True):
+        # Lower-case both sides: lowering only the kept keyword would never
+        # match a candidate that contains upper-case letters.
+        prefix = k.lower()
+        if not any(rk.lower().startswith(prefix) for rk in result):
+            result.append(k)
+    return result
+# Both are populated by init_keyword_extractor(); extract_keywords() calls it
+# on demand while either one is still None.
+vectorizer = None
+kw_model = None
+def init_keyword_extractor():
+    """Create the module-level KeyBERT model and vocabulary vectorizer.
+
+    Sets the globals ``kw_model`` (KeyBERT on the spaCy ``ru_core_news_sm``
+    pipeline) and ``vectorizer`` (a CountVectorizer restricted to the
+    normalized phrases of ``vocab_raw``, tokenizing with
+    tokenize_sentence()). keybert, spacy and scikit-learn are imported
+    inside the function, so they are only needed once it is called.
+    """
+    global vectorizer
+    global kw_model
+    from keybert import KeyBERT
+    import spacy
+    from sklearn.feature_extraction.text import CountVectorizer
+    kw_model = KeyBERT(model=spacy.load("ru_core_news_sm", exclude=['tokenizer', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']))
+    # CountVectorizer rejects a vocabulary list with repeated terms, and two
+    # aliases can normalize to the same phrase; dict.fromkeys drops repeats
+    # while keeping first-seen order.
+    vocab = list(dict.fromkeys(normalize_sentence(s) for s in vocab_raw))
+    vectorizer = CountVectorizer(ngram_range=(1, 4), vocabulary=vocab, tokenizer=tokenize_sentence)
+def extract_keywords(text):
+    """Return the canonical vocabulary keywords found in ``text``.
+
+    Initializes the extractor first if it has not been built yet. The
+    keywords reported by KeyBERT (scores discarded) are mapped to canonical
+    names by canonical_keywords() and then reduced by merge_keywords().
+    """
+    if kw_model is None or vectorizer is None:
+        init_keyword_extractor()
+    scored = kw_model.extract_keywords(text, vectorizer=vectorizer)
+    found = []
+    for keyword, _score in scored:
+        found.append(keyword)
+    return merge_keywords(canonical_keywords(found))

test.ipynb CHANGED Viewed

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -41,7 +41,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -52,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -64,17 +64,87 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "embeddings = SentenceTransformerEmbeddings(model_name=\"multi-qa-MiniLM-L6-cos-v1\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "output_dir = 'faiss_qa'"
@@ -82,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-07 17:36:37.358149: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+     ]
+    }
+   ],
    "source": [
+    "from extract_keywords import canonical_keywords, merge_keywords, tokenize_sentence, extract_keywords, init_keyword_extractor\n",
+    "init_keyword_extractor()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "['почта россия трекинг']"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extract_keywords('пр трекинг')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000\">╭─────────────────────────────── </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">Traceback </span><span style=\"color: #bf7f7f; text-decoration-color: #bf7f7f; font-weight: bold\">(most recent call last)</span><span style=\"color: #800000; text-decoration-color: #800000\"> ────────────────────────────────╮</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/tmp/ipykernel_1594240/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">2036088539.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">1</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">&lt;module&gt;</span>                                               <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000; font-style: italic\">[Errno 2] No such file or directory: '/tmp/ipykernel_1594240/2036088539.py'</span>                      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">╰──────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
+       "<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-weight: bold\">NameError: </span>name <span style=\"color: #008000; text-decoration-color: #008000\">'SentenceTransformerEmbeddings'</span> is not defined\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/tmp/ipykernel_1594240/\u001b[0m\u001b[1;33m2036088539.py\u001b[0m:\u001b[94m1\u001b[0m in \u001b[92m<module>\u001b[0m                                               \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[3;31m[Errno 2] No such file or directory: '/tmp/ipykernel_1594240/2036088539.py'\u001b[0m                      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
+       "\u001b[1;91mNameError: \u001b[0mname \u001b[32m'SentenceTransformerEmbeddings'\u001b[0m is not defined\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "embeddings = SentenceTransformerEmbeddings(model_name=\"multi-qa-MiniLM-L6-cos-v1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
    "outputs": [],
    "source": [
     "output_dir = 'faiss_qa'"
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [

test_keybert.ipynb ADDED Viewed

	@@ -0,0 +1,224 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc = 'как подключить модуль почту россии трекинг'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from keybert import KeyBERT\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "import spacy\n",
+    "nlp = spacy.load(\"ru_core_news_sm\", exclude=['tokenizer', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])\n",
+    "kw_model = KeyBERT(model=nlp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
+      ]
+     },
+     "execution_count": 74,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "string.punctuation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import string\n",
+    "\n",
+    "def tokenize_sentence(text):\n",
+    "    # remove punctuation\n",
+    "    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))\n",
+    "    # tokenize\n",
+    "    return [morph.parse(word)[0].normal_form for word in text.split()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "почта россии\n",
+      "почта\n",
+      "почта россии трекинг\n"
+     ]
+    }
+   ],
+   "source": [
+    "vocab_raw = [\n",
+    "  'почта россии', 'почта', 'почта россии трекинг',\n",
+    "  'яндекс доставка', 'яндекс доставка экспресс', 'яндекс доставка express',\n",
+    "  'альфабанк', 'альфа банк',\n",
+    "]\n",
+    "aliases = [\n",
+    "  #('canonical name', ['aliases', ...])\n",
+    "  ('почта россии', ['почта']),\n",
+    "  ('яндекс доставка экспресс', ['яндекс доставка express']),\n",
+    "  ('альфабанк', ['альфа банк']),\n",
+    "]\n",
+    "vocab = [\" \".join(tokenize_sentence(s)) for s in vocab_raw]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "как подключить модуль почту  россии трекинг\n",
+      "как подключить модуль почту  россии трекинг\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('почта россия трекинг', 0.4786), ('почта россия', 0.3053), ('почта', 0.2357)]"
+      ]
+     },
+     "execution_count": 87,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from keyphrase_vectorizers import KeyphraseCountVectorizer\n",
+    "#vectorizer = KeyphraseCountVectorizer(spacy_pipeline='ru_core_news_sm', vocabulary=vocab)\n",
+    "vectorizer = CountVectorizer(ngram_range=(1, 4), vocabulary=vocab, tokenizer=tokenize_sentence)\n",
+    "kw_model.extract_keywords(doc, vectorizer=vectorizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pymorphy3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "morph = pymorphy3.MorphAnalyzer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'почту россия'"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "morph.parse('почту')[0].normal_form"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['почта', 'россия', 'трекинг']"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenize_sentence('Почта России? трекинг')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}