Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	make to store functions
Browse files- .gitignore +5 -0
- README.md +3 -3
- config.py +21 -0
- gh_issue_loader.py +67 -0
- requirments.txt +10 -0
- store.py +59 -0
    	
        .gitignore
    ADDED
    
    | @@ -0,0 +1,5 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            .vscode/
         | 
| 2 | 
            +
            __pycache__/
         | 
| 3 | 
            +
            venv/
         | 
| 4 | 
            +
            qdrant_storage/
         | 
| 5 | 
            +
            data/
         | 
    	
        README.md
    CHANGED
    
    | @@ -1,10 +1,10 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
            -
            title:  | 
| 3 | 
             
            emoji: 🐠
         | 
| 4 | 
             
            colorFrom: green
         | 
| 5 | 
             
            colorTo: purple
         | 
| 6 | 
            -
            sdk:  | 
| 7 | 
            -
            sdk_version:  | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
             
            license: mit
         | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
            +
            title: Github Issue Search
         | 
| 3 | 
             
            emoji: 🐠
         | 
| 4 | 
             
            colorFrom: green
         | 
| 5 | 
             
            colorTo: purple
         | 
| 6 | 
            +
            sdk: streamlit
         | 
| 7 | 
            +
            sdk_version: 1.25.0
         | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
             
            license: mit
         | 
    	
        config.py
    ADDED
    
    | @@ -0,0 +1,21 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
             | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Selects which settings DB_CONFIG is built from: True uses the
            # QDRANT_URL / QDRANT_API_KEY environment variables, False uses the
            # fixed "localhost" settings.
            SAAS = False
         | 
| 5 | 
            +
             | 
| 6 | 
            +
             | 
| 7 | 
            +
            def get_db_config():
                """Build the Qdrant connection settings from the environment.

                Returns:
                    A ``(url, api_key, collection_name)`` tuple. The URL and API key
                    are read from ``QDRANT_URL`` and ``QDRANT_API_KEY``; the
                    collection name is always ``"gh-issues"``.

                Raises:
                    KeyError: If either environment variable is not set.
                """
                env = os.environ
                return env["QDRANT_URL"], env["QDRANT_API_KEY"], "gh-issues"
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            def get_local_db_congin():
                """Return the connection settings for a Qdrant instance on localhost.

                Returns:
                    A ``(url, api_key, collection_name)`` tuple. The API key slot is
                    ``None``: no key is read for the local setup.
                """
                local_settings = ("localhost", None, "gh-issues")
                return local_settings
         | 
| 19 | 
            +
             | 
| 20 | 
            +
             | 
| 21 | 
            +
            # Resolve the active (url, api_key, collection_name) tuple once, at
            # import time, based on the SAAS switch.
            if SAAS:
                DB_CONFIG = get_db_config()
            else:
                DB_CONFIG = get_local_db_congin()
         | 
    	
        gh_issue_loader.py
    ADDED
    
    | @@ -0,0 +1,67 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from dataclasses import dataclass, asdict
         | 
| 2 | 
            +
            import json
         | 
| 3 | 
            +
            from typing import Iterator
         | 
| 4 | 
            +
            from dateutil.parser import parse
         | 
| 5 | 
            +
            from langchain.docstore.document import Document
         | 
| 6 | 
            +
            from langchain.document_loaders.base import BaseLoader
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            @dataclass
            class Issue:
                """Metadata record describing one issue or one issue comment."""

                repo_name: str  # repository label supplied by the caller
                id: int  # issue "number", or the comment's "id"
                title: str  # title of the (parent) issue
                created_at: int  # creation time as an integer timestamp
                user: str  # taken from the record's "user.login" field
                url: str  # taken from the record's "html_url" field
                labels: list[str]  # the issue's "labels_" field
                type_: str  # "issue" or "comment"
         | 
| 19 | 
            +
             | 
| 20 | 
            +
             | 
| 21 | 
            +
            def date_to_int(dt_str: str) -> int:
                """Convert a date/time string into an integer timestamp.

                The string is parsed with ``dateutil.parser.parse`` and the resulting
                ``datetime.timestamp()`` value is truncated to an ``int``.
                """
                return int(parse(dt_str).timestamp())
         | 
| 24 | 
            +
             | 
| 25 | 
            +
             | 
| 26 | 
            +
            def get_contents(repo_name: str, filename: str) -> Iterator[tuple[Issue, str]]:
                """Yield ``(Issue, text)`` pairs from a line-delimited JSON issue dump.

                Each non-blank line of *filename* is parsed as one JSON object that
                describes an issue. The issue's own pair is yielded first, followed by
                one pair per entry of its ``"comments_"`` list. Lines are parsed one
                at a time, so the file is never held in memory as a whole.

                Args:
                    repo_name: Repository label copied into every ``Issue``.
                    filename: Path of the dump file to read (decoded as UTF-8).

                Yields:
                    The ``Issue`` metadata together with the body text of the issue
                    or comment. A ``null`` body is yielded as an empty string so the
                    text is always a ``str``.

                Raises:
                    KeyError: If a record lacks one of the fields read below.
                    json.JSONDecodeError: If a non-blank line is not valid JSON.
                """
                # JSON text is UTF-8; do not depend on the platform's locale encoding.
                with open(filename, "r", encoding="utf-8") as f:
                    for line in f:
                        if not line.strip():
                            # Tolerate blank lines (e.g. a trailing empty line).
                            continue
                        data = json.loads(line)
                        issue = Issue(
                            repo_name=repo_name,
                            id=data["number"],
                            title=data["title"],
                            created_at=date_to_int(data["created_at"]),
                            user=data["user.login"],
                            url=data["html_url"],
                            labels=data["labels_"],
                            type_="issue",
                        )
                        yield issue, data["body"] or ""
                        for comment in data["comments_"]:
                            # Comments inherit the title and labels of their issue.
                            issue = Issue(
                                repo_name=repo_name,
                                id=comment["id"],
                                title=data["title"],
                                created_at=date_to_int(comment["created_at"]),
                                user=comment["user.login"],
                                url=comment["html_url"],
                                labels=data["labels_"],
                                type_="comment",
                            )
                            yield issue, comment["body"] or ""
         | 
| 54 | 
            +
             | 
| 55 | 
            +
             | 
| 56 | 
            +
            class GHLoader(BaseLoader):
                """Document loader for an issue dump file read via ``get_contents``.

                Every issue and comment becomes one ``Document`` whose metadata is
                the corresponding ``Issue`` record converted to a dict.
                """

                def __init__(self, repo_name: str, filename: str):
                    """Remember the repository label and the dump file to load."""
                    self.filename = filename
                    self.repo_name = repo_name

                def lazy_load(self) -> Iterator[Document]:
                    """Yield one ``Document`` per issue or comment."""
                    pairs = get_contents(self.repo_name, self.filename)
                    for record, body in pairs:
                        yield Document(page_content=body, metadata=asdict(record))

                def load(self) -> list[Document]:
                    """Return everything ``lazy_load`` produces as a list."""
                    return [doc for doc in self.lazy_load()]
         | 
    	
        requirments.txt
    ADDED
    
    | @@ -0,0 +1,10 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            langchain
         | 
| 2 | 
            +
            tiktoken
         | 
| 3 | 
            +
            qdrant-client
         | 
| 4 | 
            +
            torch
         | 
| 5 | 
            +
            transformers
         | 
| 6 | 
            +
            accelerate
         | 
| 7 | 
            +
            bitsandbytes
         | 
| 8 | 
            +
            sentence_transformers
         | 
| 9 | 
            +
            streamlit
         | 
| 10 | 
            +
            python-dateutil
         | 
    	
        store.py
    ADDED
    
    | @@ -0,0 +1,59 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from langchain.text_splitter import RecursiveCharacterTextSplitter
         | 
| 2 | 
            +
            from langchain.embeddings import HuggingFaceEmbeddings
         | 
| 3 | 
            +
            from langchain.vectorstores import Qdrant
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from gh_issue_loader import GHLoader
         | 
| 6 | 
            +
            from config import DB_CONFIG
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            # Value passed as chunk_size to the text splitter in get_text_chunk.
            CHUNK_SIZE = 500
         | 
| 10 | 
            +
             | 
| 11 | 
            +
             | 
| 12 | 
            +
            def get_text_chunk(docs):
                """Split *docs* into smaller documents.

                Uses a ``RecursiveCharacterTextSplitter`` configured with
                ``chunk_size=CHUNK_SIZE`` and no overlap between chunks, and returns
                the result of its ``split_documents`` call.
                """
                splitter = RecursiveCharacterTextSplitter(
                    chunk_overlap=0,
                    chunk_size=CHUNK_SIZE,
                )
                return splitter.split_documents(docs)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            def store(texts):
                """Embed *texts* and upload them to the configured Qdrant collection.

                The embedding model is ``intfloat/multilingual-e5-large`` with
                embedding normalization disabled. It runs on CUDA when a GPU is
                available and falls back to the CPU otherwise, so the function also
                works on machines without a GPU.

                Args:
                    texts: Documents to embed and store.
                """
                # Local import: torch is only needed here, to probe for a GPU.
                import torch

                device = "cuda" if torch.cuda.is_available() else "cpu"
                embeddings = HuggingFaceEmbeddings(
                    model_name="intfloat/multilingual-e5-large",
                    model_kwargs={"device": device},
                    encode_kwargs={"normalize_embeddings": False},
                )
                db_url, db_api_key, db_collection_name = DB_CONFIG
                # The returned vector-store handle is not needed; the upload itself
                # is the effect we want.
                Qdrant.from_documents(
                    texts,
                    embeddings,
                    url=db_url,
                    api_key=db_api_key,
                    collection_name=db_collection_name,
                )
         | 
| 37 | 
            +
             | 
| 38 | 
            +
             | 
| 39 | 
            +
            def main(repo_name: str, path: str) -> None:
                """Load the issue dump at *path*, split it into chunks, and store them.

                Args:
                    repo_name: Repository label recorded in each document's metadata.
                    path: Location of the issue dump file.
                """
                documents = GHLoader(repo_name, path).load()
                store(get_text_chunk(documents))
         | 
| 44 | 
            +
             | 
| 45 | 
            +
             | 
| 46 | 
            +
            if __name__ == "__main__":
                # Usage:
                #   $ python store.py "REPO_NAME" "FILE_PATH"
                #   $ python store.py cocoa data/cocoa-issues.json
                import sys

                args = sys.argv
                if len(args) != 3:
                    # Report misuse on stderr and exit non-zero so that shells and
                    # calling scripts can tell the run did nothing.
                    print(
                        "No args, you need two args for repo_name, json_file_path",
                        file=sys.stderr,
                    )
                    sys.exit(1)
                repo_name = args[1]
                path = args[2]
                main(repo_name, path)
         | 

