Spaces:
Running
Running
Commit
·
8d5b271
1
Parent(s):
c1dc2ee
refactoring: move index annotation
Browse files
- loaders/github_issue.py +3 -6
- loaders/rtdhtmlpage.py +1 -3
- loaders/wikipage.py +3 -5
- models.py +1 -1
- store.py +8 -3
loaders/github_issue.py
CHANGED
|
@@ -15,14 +15,13 @@ def date_to_int(dt_str: str) -> int:
|
|
| 15 |
return int(dt.timestamp())
|
| 16 |
|
| 17 |
|
| 18 |
-
def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
|
| 19 |
with inputfile.open("r") as f:
|
| 20 |
obj = [json.loads(line) for line in f]
|
| 21 |
for data in obj:
|
| 22 |
title = data["title"]
|
| 23 |
body = data["body"]
|
| 24 |
issue = GithubIssue(
|
| 25 |
-
index=index,
|
| 26 |
id=data["number"],
|
| 27 |
title=title,
|
| 28 |
ctime=date_to_int(data["created_at"]),
|
|
@@ -37,7 +36,6 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str
|
|
| 37 |
comments = data["comments_"]
|
| 38 |
for comment in comments:
|
| 39 |
issue = GithubIssue(
|
| 40 |
-
index=index,
|
| 41 |
id=comment["id"],
|
| 42 |
title=data["title"],
|
| 43 |
ctime=date_to_int(comment["created_at"]),
|
|
@@ -50,12 +48,11 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str
|
|
| 50 |
|
| 51 |
|
| 52 |
class GithubIssueLoader(BaseLoader):
|
| 53 |
-
def __init__(self, index: str, inputfile: Path):
|
| 54 |
-
self.index = index
|
| 55 |
self.inputfile = inputfile
|
| 56 |
|
| 57 |
def lazy_load(self) -> Iterator[Document]:
|
| 58 |
-
for issue, text in get_contents(self.index, self.inputfile):
|
| 59 |
metadata = asdict(issue)
|
| 60 |
yield Document(page_content=text, metadata=metadata)
|
| 61 |
|
|
|
|
| 15 |
return int(dt.timestamp())
|
| 16 |
|
| 17 |
|
| 18 |
+
def get_contents(inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
|
| 19 |
with inputfile.open("r") as f:
|
| 20 |
obj = [json.loads(line) for line in f]
|
| 21 |
for data in obj:
|
| 22 |
title = data["title"]
|
| 23 |
body = data["body"]
|
| 24 |
issue = GithubIssue(
|
|
|
|
| 25 |
id=data["number"],
|
| 26 |
title=title,
|
| 27 |
ctime=date_to_int(data["created_at"]),
|
|
|
|
| 36 |
comments = data["comments_"]
|
| 37 |
for comment in comments:
|
| 38 |
issue = GithubIssue(
|
|
|
|
| 39 |
id=comment["id"],
|
| 40 |
title=data["title"],
|
| 41 |
ctime=date_to_int(comment["created_at"]),
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
class GithubIssueLoader(BaseLoader):
|
| 51 |
+
def __init__(self, inputfile: Path):
|
|
|
|
| 52 |
self.inputfile = inputfile
|
| 53 |
|
| 54 |
def lazy_load(self) -> Iterator[Document]:
|
| 55 |
+
for issue, text in get_contents(self.inputfile):
|
| 56 |
metadata = asdict(issue)
|
| 57 |
yield Document(page_content=text, metadata=metadata)
|
| 58 |
|
loaders/rtdhtmlpage.py
CHANGED
|
@@ -12,8 +12,7 @@ class RTDHtmlPageLoader(ReadTheDocsLoader):
|
|
| 12 |
$ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
|
| 13 |
$ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
|
| 14 |
"""
|
| 15 |
-
def __init__(self, index: str, inputfile: Path, *args, **kwargs):
|
| 16 |
-
self.index = index
|
| 17 |
kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
|
| 18 |
super().__init__(inputfile, *args, **kwargs)
|
| 19 |
|
|
@@ -66,7 +65,6 @@ class RTDHtmlPageLoader(ReadTheDocsLoader):
|
|
| 66 |
"user": "rtd",
|
| 67 |
"type": "rtd",
|
| 68 |
"url": f"https://{str(p)}",
|
| 69 |
-
"index": self.index,
|
| 70 |
"id": str(p),
|
| 71 |
}
|
| 72 |
# print(metadata)
|
|
|
|
| 12 |
$ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
|
| 13 |
$ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
|
| 14 |
"""
|
| 15 |
+
def __init__(self, inputfile: Path, *args, **kwargs):
|
|
|
|
| 16 |
kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
|
| 17 |
super().__init__(inputfile, *args, **kwargs)
|
| 18 |
|
|
|
|
| 65 |
"user": "rtd",
|
| 66 |
"type": "rtd",
|
| 67 |
"url": f"https://{str(p)}",
|
|
|
|
| 68 |
"id": str(p),
|
| 69 |
}
|
| 70 |
# print(metadata)
|
loaders/wikipage.py
CHANGED
|
@@ -15,7 +15,7 @@ def date_to_int(dt_str: str) -> int:
|
|
| 15 |
return int(dt.timestamp())
|
| 16 |
|
| 17 |
|
| 18 |
-
def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
|
| 19 |
"""filename for file with ndjson
|
| 20 |
|
| 21 |
{"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
|
|
@@ -28,7 +28,6 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
|
|
| 28 |
body = data["content"]
|
| 29 |
ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
|
| 30 |
doc = WikiPage(
|
| 31 |
-
index=index,
|
| 32 |
id=data["id"],
|
| 33 |
title=title,
|
| 34 |
ctime=ctime,
|
|
@@ -42,12 +41,11 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
|
|
| 42 |
|
| 43 |
|
| 44 |
class WikiPageLoader(BaseLoader):
|
| 45 |
-
def __init__(self, index: str, inputfile: Path):
|
| 46 |
-
self.index = index
|
| 47 |
self.inputfile = inputfile
|
| 48 |
|
| 49 |
def lazy_load(self) -> Iterator[Document]:
|
| 50 |
-
for doc, text in get_contents(self.index, self.inputfile):
|
| 51 |
metadata = asdict(doc)
|
| 52 |
yield Document(page_content=text, metadata=metadata)
|
| 53 |
|
|
|
|
| 15 |
return int(dt.timestamp())
|
| 16 |
|
| 17 |
|
| 18 |
+
def get_contents(inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
|
| 19 |
"""filename for file with ndjson
|
| 20 |
|
| 21 |
{"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
|
|
|
|
| 28 |
body = data["content"]
|
| 29 |
ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
|
| 30 |
doc = WikiPage(
|
|
|
|
| 31 |
id=data["id"],
|
| 32 |
title=title,
|
| 33 |
ctime=ctime,
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
class WikiPageLoader(BaseLoader):
|
| 44 |
+
def __init__(self, inputfile: Path):
|
|
|
|
| 45 |
self.inputfile = inputfile
|
| 46 |
|
| 47 |
def lazy_load(self) -> Iterator[Document]:
|
| 48 |
+
for doc, text in get_contents(self.inputfile):
|
| 49 |
metadata = asdict(doc)
|
| 50 |
yield Document(page_content=text, metadata=metadata)
|
| 51 |
|
models.py
CHANGED
|
@@ -3,13 +3,13 @@ import dataclasses
|
|
| 3 |
|
| 4 |
@dataclasses.dataclass(frozen=True)
|
| 5 |
class BaseModel:
|
| 6 |
-
index: str
|
| 7 |
id: int
|
| 8 |
title: str
|
| 9 |
ctime: int
|
| 10 |
user: str
|
| 11 |
url: str
|
| 12 |
type: str
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
@dataclasses.dataclass(frozen=True)
|
|
|
|
| 3 |
|
| 4 |
@dataclasses.dataclass(frozen=True)
|
| 5 |
class BaseModel:
|
|
|
|
| 6 |
id: int
|
| 7 |
title: str
|
| 8 |
ctime: int
|
| 9 |
user: str
|
| 10 |
url: str
|
| 11 |
type: str
|
| 12 |
+
index: str = ""
|
| 13 |
|
| 14 |
|
| 15 |
@dataclasses.dataclass(frozen=True)
|
store.py
CHANGED
|
@@ -61,6 +61,12 @@ def get_parser():
|
|
| 61 |
return p
|
| 62 |
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
def main():
|
| 65 |
"""
|
| 66 |
$ python store.py --loader wikipage "index" "FILE_PATH"
|
|
@@ -71,12 +77,11 @@ def main():
|
|
| 71 |
args = p.parse_args()
|
| 72 |
loader = get_loader(
|
| 73 |
args.loader,
|
| 74 |
-
index=args.index,
|
| 75 |
inputfile=Path(args.inputfile),
|
| 76 |
)
|
| 77 |
|
| 78 |
-
docs = loader.lazy_load()
|
| 79 |
-
texts = get_text_chunk(docs)
|
| 80 |
store(texts)
|
| 81 |
|
| 82 |
|
|
|
|
| 61 |
return p
|
| 62 |
|
| 63 |
|
| 64 |
+
def index_annotated_docs(docs, index):
|
| 65 |
+
for doc in docs:
|
| 66 |
+
doc.metadata["index"] = index
|
| 67 |
+
yield doc
|
| 68 |
+
|
| 69 |
+
|
| 70 |
def main():
|
| 71 |
"""
|
| 72 |
$ python store.py --loader wikipage "index" "FILE_PATH"
|
|
|
|
| 77 |
args = p.parse_args()
|
| 78 |
loader = get_loader(
|
| 79 |
args.loader,
|
|
|
|
| 80 |
inputfile=Path(args.inputfile),
|
| 81 |
)
|
| 82 |
|
| 83 |
+
docs = loader.lazy_load()
|
| 84 |
+
texts = get_text_chunk(index_annotated_docs(docs, args.index))
|
| 85 |
store(texts)
|
| 86 |
|
| 87 |
|