victormiller committed commit 8a16e84 (parent: 0bc171c)
Update curated.py

curated.py CHANGED (+201 -24)
@@ -571,6 +571,183 @@ phil_examples = Div(
     ),
 )
 
+arx_examples = Div(
+    Div(
+        get_arx_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "S2ORC":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/s2orc_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="S2ORC",
+        data_sources="S2ORC",
+        target=target,
+    )
+
+s2o_examples = Div(
+    Div(
+        get_S2ORC_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "S2ORC Abstract":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/s2orc_abstract_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="S2ORC Abstract",
+        data_sources="S2ORC Abstract",
+        target=target,
+    )
+
+s2oa_examples = Div(
+    Div(
+        get_S2ORCA_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Pubmed":
+        raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/pubmed_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Pubmed",
+        data_sources="Pubmed",
+        target=target,
+    )
+
+pubmed_examples = Div(
+    Div(
+        get_pubmed_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "DM Math":
+        raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/dm_maths_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="DM Math",
+        data_sources="DM Math",
+        target=target,
+    )
+
+dmm_examples = Div(
+    Div(
+        get_dmm_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "PG19":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/pg19_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="PG19",
+        data_sources="PG19",
+        target=target,
+    )
+
+pg19_examples = Div(
+    Div(
+        get_pg19_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Europarl":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/europarl_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Europarl",
+        data_sources="Europarl",
+        target=target,
+    )
+
+eu_examples = Div(
+    Div(
+        get_eu_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
 filtering_process = Div(
     Section(
         H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
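The loaders added above all repeat one pattern: clamp doc_id into 0-9, read a raw (and, for PubMed and DM Math, a separate extracted) sample JSON for the named source, and pass both documents to view_data. A rough sketch of a single parameterized loader covering the same cases follows; get_sample_data and SAMPLE_FILES are hypothetical names that are not in this commit, and view_data, Div, and gen_random_id are assumed to behave as they do in the functions above.

import json

# Hypothetical consolidation of the repeated get_*_data helpers (not part of the commit).
SAMPLE_FILES = {
    "S2ORC": ("data/curated_samples/s2orc_raw.json", None),
    "S2ORC Abstract": ("data/curated_samples/s2orc_abstract_raw.json", None),
    "Pubmed": ("data/curated_samples/pubmed_raw.json", "data/curated_samples/pubmed_extract.json"),
    "DM Math": ("data/curated_samples/dm_maths_raw.json", "data/curated_samples/dm_maths_extract.json"),
    "PG19": ("data/curated_samples/pg19_raw.json", None),
    "Europarl": ("data/curated_samples/europarl_raw.json", None),
}

def get_sample_data(data_source: str, doc_id: int = 3, target: str = "foo"):
    doc_id = max(0, min(int(doc_id), 9))
    raw_path, extract_path = SAMPLE_FILES.get(data_source, (None, None))
    if raw_path is None:
        raw_docs = extracted_docs = [{} for _ in range(10)]
    else:
        with open(raw_path) as f:
            raw_docs = json.load(f)
        if extract_path is None:
            extracted_docs = raw_docs
        else:
            with open(extract_path) as f:
                extracted_docs = json.load(f)
    return view_data(
        raw_docs[doc_id],
        extracted_docs[doc_id],
        doc_id=doc_id,
        data_source=data_source,
        data_sources=data_source,
        target=target,
    )

# e.g. pg19_examples = Div(Div(get_sample_data("PG19", target=gen_random_id()), ...))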
@@ -605,10 +782,10 @@ filtering_process = Div(
                 Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alphabetic characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
             ),
             table_div_arx,
-
-
-
-
+            Details(
+                Summary("ArXiv Filtering Examples"),
+                arx_examples,
+            ),
         ),
     ),
     Section(
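The Frequency Filter described in the note above (the most frequent whitespace-separated word must be alphabetic and occur in less than 7.5% of the document) could be sketched roughly as below; passes_frequency_filter is a hypothetical name, not code from this commit.

from collections import Counter

def passes_frequency_filter(text: str, max_fraction: float = 0.075) -> bool:
    # Words are obtained by splitting the text on whitespace, as described above.
    words = text.split()
    if not words:
        return False
    word, count = Counter(words).most_common(1)[0]
    # The most frequent word must consist of alphabetic characters only and
    # appear in less than 7.5% of the document's words.
    return word.isalpha() and (count / len(words)) < max_fraction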
@@ -647,10 +824,10 @@ filtering_process = Div(
                 Li("This data was part of the paper domain; the paper datasets were combined, MinHash signatures were generated, and deduplication was performed across all datasets after local dedup."),
             ),
             table_div_s2o,
-
-
-
-
+            Details(
+                Summary("FreeLaw Filtering Examples -- need to update"),
+                freelaw_examples,
+            ),
         ),
     ),
     Section(
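The MinHash deduplication mentioned in the hunk above could, for illustration, be done with an LSH index such as the datasketch library; the library choice, shingling by unique lowercase tokens, and the 0.8 threshold are assumptions for this sketch, not the project's actual settings.

from datasketch import MinHash, MinHashLSH

def minhash_signature(text: str, num_perm: int = 128) -> MinHash:
    # One permutation-hashed signature per document, built from its unique tokens.
    m = MinHash(num_perm=num_perm)
    for token in set(text.lower().split()):
        m.update(token.encode("utf8"))
    return m

# Index every document, then query a signature for near-duplicates above an
# approximate Jaccard threshold.
lsh = MinHashLSH(threshold=0.8, num_perm=128)
docs = {"paper-1": "sample paper text ...", "paper-2": "sample paper text ..."}
for key, text in docs.items():
    lsh.insert(key, minhash_signature(text))
near_duplicates = lsh.query(minhash_signature(docs["paper-1"]))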
@@ -683,10 +860,10 @@ filtering_process = Div(
                 Li("This data was part of the paper domain; the paper datasets were combined, MinHash signatures were generated, and deduplication was performed across all datasets after local dedup."),
             ),
             table_div_med,
-
-
-
-
+            Details(
+                Summary("PubMed Filtering Examples"),
+                pubmed_examples,
+            ),
         ),
     ),
     Section(
@@ -715,10 +892,10 @@ filtering_process = Div(
             H4("Filtering"),
             P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS', which were removed."),
             table_div_up,
-
-
-
-
+            Details(
+                Summary("EuroParl Filtering Examples"),
+                eu_examples,
+            ),
         ),
     ),
     Section(
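The EuroParl filtering described above (strip the 'TAGS', drop documents under 200 characters) might look roughly like this; clean_europarl_doc and the angle-bracket tag pattern are assumptions, since the commit does not show the exact tag format.

import re

TAG_RE = re.compile(r"<[^>]+>")  # assumed shape of the EuroParl 'TAGS' markup

def clean_europarl_doc(text: str, min_chars: int = 200):
    # Strip the tag markup, then drop documents shorter than 200 characters.
    cleaned = TAG_RE.sub("", text).strip()
    return cleaned if len(cleaned) >= min_chars else None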
@@ -860,10 +1037,10 @@ filtering_process = Div(
                 Li("None"),
             ),
             table_div_dmm,
-
-
-
-
+            Details(
+                Summary("DM Math Filtering Examples"),
+                dmm_examples,
+            ),
         ),
     ),
     Section(
@@ -881,10 +1058,10 @@ filtering_process = Div(
                 Li("Unigram Log Probability"),
             ),
             table_div_pg19,
-
-
-
-
+            Details(
+                Summary("PG-19 Filtering Examples"),
+                pg19_examples,
+            ),
         ),
     ),
 )
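The Unigram Log Probability filter listed for PG-19 above is typically computed as the mean log-probability of a document's words under a reference unigram frequency table; the sketch below is illustrative only, and the reference table, probability floor, and any threshold are assumptions rather than the project's values.

import math

def mean_unigram_log_prob(text: str, unigram_probs: dict, floor: float = 1e-9) -> float:
    # Average log-probability of the document's words under a reference unigram
    # distribution; unusually low scores flag noisy or non-prose text.
    words = text.lower().split()
    if not words:
        return float("-inf")
    return sum(math.log(unigram_probs.get(word, floor)) for word in words) / len(words)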