Spaces:
Running
Running
Commit
·
8d5b271
1
Parent(s):
c1dc2ee
refactoring: move index annotation
Browse files
- loaders/github_issue.py +3 -6
- loaders/rtdhtmlpage.py +1 -3
- loaders/wikipage.py +3 -5
- models.py +1 -1
- store.py +8 -3
loaders/github_issue.py
CHANGED
|
@@ -15,14 +15,13 @@ def date_to_int(dt_str: str) -> int:
|
|
| 15 |
return int(dt.timestamp())
|
| 16 |
|
| 17 |
|
| 18 |
-
def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
|
| 19 |
with inputfile.open("r") as f:
|
| 20 |
obj = [json.loads(line) for line in f]
|
| 21 |
for data in obj:
|
| 22 |
title = data["title"]
|
| 23 |
body = data["body"]
|
| 24 |
issue = GithubIssue(
|
| 25 |
-
index=index,
|
| 26 |
id=data["number"],
|
| 27 |
title=title,
|
| 28 |
ctime=date_to_int(data["created_at"]),
|
|
@@ -37,7 +36,6 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str
|
|
| 37 |
comments = data["comments_"]
|
| 38 |
for comment in comments:
|
| 39 |
issue = GithubIssue(
|
| 40 |
-
index=index,
|
| 41 |
id=comment["id"],
|
| 42 |
title=data["title"],
|
| 43 |
ctime=date_to_int(comment["created_at"]),
|
|
@@ -50,12 +48,11 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str
|
|
| 50 |
|
| 51 |
|
| 52 |
class GithubIssueLoader(BaseLoader):
|
| 53 |
-
def __init__(self, index: str, inputfile: Path):
|
| 54 |
-
self.index = index
|
| 55 |
self.inputfile = inputfile
|
| 56 |
|
| 57 |
def lazy_load(self) -> Iterator[Document]:
|
| 58 |
-
for issue, text in get_contents(self.index, self.inputfile):
|
| 59 |
metadata = asdict(issue)
|
| 60 |
yield Document(page_content=text, metadata=metadata)
|
| 61 |
|
|
|
|
| 15 |
return int(dt.timestamp())
|
| 16 |
|
| 17 |
|
| 18 |
+
def get_contents(inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
|
| 19 |
with inputfile.open("r") as f:
|
| 20 |
obj = [json.loads(line) for line in f]
|
| 21 |
for data in obj:
|
| 22 |
title = data["title"]
|
| 23 |
body = data["body"]
|
| 24 |
issue = GithubIssue(
|
|
|
|
| 25 |
id=data["number"],
|
| 26 |
title=title,
|
| 27 |
ctime=date_to_int(data["created_at"]),
|
|
|
|
| 36 |
comments = data["comments_"]
|
| 37 |
for comment in comments:
|
| 38 |
issue = GithubIssue(
|
|
|
|
| 39 |
id=comment["id"],
|
| 40 |
title=data["title"],
|
| 41 |
ctime=date_to_int(comment["created_at"]),
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
class GithubIssueLoader(BaseLoader):
|
| 51 |
+
def __init__(self, inputfile: Path):
|
|
|
|
| 52 |
self.inputfile = inputfile
|
| 53 |
|
| 54 |
def lazy_load(self) -> Iterator[Document]:
|
| 55 |
+
for issue, text in get_contents(self.inputfile):
|
| 56 |
metadata = asdict(issue)
|
| 57 |
yield Document(page_content=text, metadata=metadata)
|
| 58 |
|
loaders/rtdhtmlpage.py
CHANGED
|
@@ -12,8 +12,7 @@ class RTDHtmlPageLoader(ReadTheDocsLoader):
|
|
| 12 |
$ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
|
| 13 |
$ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
|
| 14 |
"""
|
| 15 |
-
def __init__(self, index: str, inputfile: Path, *args, **kwargs):
|
| 16 |
-
self.index = index
|
| 17 |
kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
|
| 18 |
super().__init__(inputfile, *args, **kwargs)
|
| 19 |
|
|
@@ -66,7 +65,6 @@ class RTDHtmlPageLoader(ReadTheDocsLoader):
|
|
| 66 |
"user": "rtd",
|
| 67 |
"type": "rtd",
|
| 68 |
"url": f"https://{str(p)}",
|
| 69 |
-
"index": self.index,
|
| 70 |
"id": str(p),
|
| 71 |
}
|
| 72 |
# print(metadata)
|
|
|
|
| 12 |
$ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
|
| 13 |
$ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
|
| 14 |
"""
|
| 15 |
+
def __init__(self, inputfile: Path, *args, **kwargs):
|
|
|
|
| 16 |
kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
|
| 17 |
super().__init__(inputfile, *args, **kwargs)
|
| 18 |
|
|
|
|
| 65 |
"user": "rtd",
|
| 66 |
"type": "rtd",
|
| 67 |
"url": f"https://{str(p)}",
|
|
|
|
| 68 |
"id": str(p),
|
| 69 |
}
|
| 70 |
# print(metadata)
|
loaders/wikipage.py
CHANGED
|
@@ -15,7 +15,7 @@ def date_to_int(dt_str: str) -> int:
|
|
| 15 |
return int(dt.timestamp())
|
| 16 |
|
| 17 |
|
| 18 |
-
def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
|
| 19 |
"""filename for file with ndjson
|
| 20 |
|
| 21 |
{"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
|
|
@@ -28,7 +28,6 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
|
|
| 28 |
body = data["content"]
|
| 29 |
ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
|
| 30 |
doc = WikiPage(
|
| 31 |
-
index=index,
|
| 32 |
id=data["id"],
|
| 33 |
title=title,
|
| 34 |
ctime=ctime,
|
|
@@ -42,12 +41,11 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
|
|
| 42 |
|
| 43 |
|
| 44 |
class WikiPageLoader(BaseLoader):
|
| 45 |
-
def __init__(self, index: str, inputfile: Path):
|
| 46 |
-
self.index = index
|
| 47 |
self.inputfile = inputfile
|
| 48 |
|
| 49 |
def lazy_load(self) -> Iterator[Document]:
|
| 50 |
-
for doc, text in get_contents(self.index, self.inputfile):
|
| 51 |
metadata = asdict(doc)
|
| 52 |
yield Document(page_content=text, metadata=metadata)
|
| 53 |
|
|
|
|
| 15 |
return int(dt.timestamp())
|
| 16 |
|
| 17 |
|
| 18 |
+
def get_contents(inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
|
| 19 |
"""filename for file with ndjson
|
| 20 |
|
| 21 |
{"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
|
|
|
|
| 28 |
body = data["content"]
|
| 29 |
ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
|
| 30 |
doc = WikiPage(
|
|
|
|
| 31 |
id=data["id"],
|
| 32 |
title=title,
|
| 33 |
ctime=ctime,
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
class WikiPageLoader(BaseLoader):
|
| 44 |
+
def __init__(self, inputfile: Path):
|
|
|
|
| 45 |
self.inputfile = inputfile
|
| 46 |
|
| 47 |
def lazy_load(self) -> Iterator[Document]:
|
| 48 |
+
for doc, text in get_contents(self.inputfile):
|
| 49 |
metadata = asdict(doc)
|
| 50 |
yield Document(page_content=text, metadata=metadata)
|
| 51 |
|
models.py
CHANGED
|
@@ -3,13 +3,13 @@ import dataclasses
|
|
| 3 |
|
| 4 |
@dataclasses.dataclass(frozen=True)
|
| 5 |
class BaseModel:
|
| 6 |
-
index: str
|
| 7 |
id: int
|
| 8 |
title: str
|
| 9 |
ctime: int
|
| 10 |
user: str
|
| 11 |
url: str
|
| 12 |
type: str
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
@dataclasses.dataclass(frozen=True)
|
|
|
|
| 3 |
|
| 4 |
@dataclasses.dataclass(frozen=True)
|
| 5 |
class BaseModel:
|
|
|
|
| 6 |
id: int
|
| 7 |
title: str
|
| 8 |
ctime: int
|
| 9 |
user: str
|
| 10 |
url: str
|
| 11 |
type: str
|
| 12 |
+
index: str = ""
|
| 13 |
|
| 14 |
|
| 15 |
@dataclasses.dataclass(frozen=True)
|
store.py
CHANGED
|
@@ -61,6 +61,12 @@ def get_parser():
|
|
| 61 |
return p
|
| 62 |
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
def main():
|
| 65 |
"""
|
| 66 |
$ python store.py --loader wikipage "index" "FILE_PATH"
|
|
@@ -71,12 +77,11 @@ def main():
|
|
| 71 |
args = p.parse_args()
|
| 72 |
loader = get_loader(
|
| 73 |
args.loader,
|
| 74 |
-
index=args.index,
|
| 75 |
inputfile=Path(args.inputfile),
|
| 76 |
)
|
| 77 |
|
| 78 |
-
docs = loader.lazy_load()
|
| 79 |
-
texts = get_text_chunk(docs)
|
| 80 |
store(texts)
|
| 81 |
|
| 82 |
|
|
|
|
| 61 |
return p
|
| 62 |
|
| 63 |
|
| 64 |
+
def index_annotated_docs(docs, index):
|
| 65 |
+
for doc in docs:
|
| 66 |
+
doc.metadata["index"] = index
|
| 67 |
+
yield doc
|
| 68 |
+
|
| 69 |
+
|
| 70 |
def main():
|
| 71 |
"""
|
| 72 |
$ python store.py --loader wikipage "index" "FILE_PATH"
|
|
|
|
| 77 |
args = p.parse_args()
|
| 78 |
loader = get_loader(
|
| 79 |
args.loader,
|
|
|
|
| 80 |
inputfile=Path(args.inputfile),
|
| 81 |
)
|
| 82 |
|
| 83 |
+
docs = loader.lazy_load()
|
| 84 |
+
texts = get_text_chunk(index_annotated_docs(docs, args.index))
|
| 85 |
store(texts)
|
| 86 |
|
| 87 |
|