Commit 61deef0
Parent(s): 370837e
all data views in

web.py CHANGED

@@ -9,7 +9,7 @@ from data.url_blocklist import urls_high_matches, urls_false_positives
 from data.non_web_urls import non_web_urls


-def view_data_static(
+def DVS(
     left,
     header,
 ):

@@ -28,7 +28,7 @@ def view_data_static(
     return Div(H3(header), data_display, style="margin-top: 10px;")


-def view_data(
+def DV(
     left_file,
     doc_id,
     header,

@@ -79,7 +79,7 @@ def view_data(
     return Div(form, data_display, style="margin-top: 10px;", id=target)


-def
+def DV2(
     left_file,
     right_file,
     doc_id,

@@ -149,7 +149,7 @@ def update(target: str, request):
     right_file = params.get("right_file")
     if left_file and right_file:
         return (
-
+            DV2(
                 left_file,
                 right_file,
                 doc_id,

@@ -157,7 +157,7 @@ def update(target: str, request):
             ),
         )
     else:
-        return
+        return DV(
             left_file,
             doc_id,
             params.get("header"),

@@ -206,18 +206,18 @@ def web_data():
         we found WET files to include boilerplate content like navigation menus, ads, and other irrelevant texts.
         Accordingly, our pipeline starts from raw WARC files, reads with the warcio library, and extracts texts using trafilatura.
         """),
-
+        DV2("data/sample_wet.json", "data/sample_warc.json", 3),
         H4("1.2 Language Identification"),
         P("""
         After text extraction, the non-English texts are then filtered out by fastText language identifier with a threshold of 0.65.
         This step removes over 60% of the whole data.
         """),
-
+        DV(
             "data/sample_non_en.json",
             3,
             "Sample documents that are classified as non-English",
         ),
-
+        DV(
             "data/sample_en_low.json",
             3,
             "Sample documents that are classified as English but with score less than 0.65",
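
A rough sketch of the two steps described in this hunk: text extraction from raw WARC files with warcio plus trafilatura, and fastText language identification with the 0.65 threshold quoted in the text. The WARC path, the lid.176.bin model path, and the function name are illustrative assumptions, not code from web.py.

    import fasttext
    import trafilatura
    from warcio.archiveiterator import ArchiveIterator

    LID_THRESHOLD = 0.65  # threshold quoted in the text
    lid_model = fasttext.load_model("lid.176.bin")  # assumed local copy of the fastText LID model

    def extract_english_docs(warc_path):
        """Yield extracted English text from the response records of one WARC file."""
        with open(warc_path, "rb") as stream:
            for record in ArchiveIterator(stream):
                if record.rec_type != "response":
                    continue
                html = record.content_stream().read().decode("utf-8", errors="ignore")
                text = trafilatura.extract(html)  # strips navigation menus, ads, and other boilerplate
                if not text:
                    continue
                labels, scores = lid_model.predict(text.replace("\n", " "))  # predict() expects a single line
                if labels[0] == "__label__en" and scores[0] >= LID_THRESHOLD:
                    yield text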

@@ -233,14 +233,12 @@ def web_data():
         articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
         4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
         """),
-
+        DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
         P("""
         We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
         """),
-
-
-        ),
-        view_data(
+        DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
+        DV(
             "data/bad_url_doc.jsonl",
             3,
             "Sample documents whose urls are blocked by the refined url blocklist",
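
The UT1 matching described above amounts to comparing URL hosts against a set of blocked domain names. A simplified sketch, assuming the refined blocklist is a plain set of bare domains; the suffix-matching rule and the helper name are illustrative.

    from urllib.parse import urlparse

    def is_blocked(url: str, blocked_domains: set) -> bool:
        """True if the URL's host is a blocked domain or a subdomain of one."""
        host = urlparse(url).netloc.lower().split(":")[0]
        parts = host.split(".")
        # check the host and every parent domain, e.g. ads.example.com -> example.com
        return any(".".join(parts[i:]) in blocked_domains for i in range(len(parts)))

    # is_blocked("https://ads.example.com/page", {"example.com"})  -> True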

@@ -249,11 +247,11 @@ def web_data():
         P("""
         To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
         """),
-
+        DVS(
             non_web_urls,
             "curated url domains that are excluded from our dataset",
         ),
-
+        DV(
             "data/sample_url_exclusion.json",
             0,
             "Sample documents whose urls are in our curated url domain list",

@@ -272,7 +270,7 @@ def web_data():
         of 56,292 additional lines, resulting in the complete exclusion of 2,203 documents from a total of 13,560
         documents (16.25%). Accordingly, we choose to not use terminal punctuation as a signal to remove lines.
         """),
-
+        DV(
             "data/sample_terminal_punc.json",
             0,
             "Sample documents with lines that are removed by the rule of terminal punctuation",
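
For context, the terminal-punctuation rule this hunk reports evaluating (and ultimately not adopting) can be written as a one-line filter; the punctuation set and function name are assumptions based on the C4 description.

    TERMINAL_PUNCT = (".", "!", "?", '"', "”")

    def keep_terminal_punct_lines(text: str) -> str:
        """C4-style rule: keep only lines that end with a terminal punctuation mark."""
        return "\n".join(line for line in text.split("\n") if line.rstrip().endswith(TERMINAL_PUNCT))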

@@ -285,7 +283,7 @@ def web_data():
         propose to refine the strategy by adding one more keyword to the word "javascript" to avoid false positives.
         The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
         """),
-
+        DV(
             "data/sample_java.jsonl",
             0,
             "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
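
The refined javascript rule described above only drops a line when "javascript" co-occurs with one of the listed keywords. A minimal sketch with an illustrative function name:

    JS_KEYWORDS = ("enable", "disable", "require", "activate", "browser")

    def is_js_boilerplate_line(line: str) -> bool:
        """Drop the line only if it mentions 'javascript' together with one of the extra keywords."""
        lower = line.lower()
        return "javascript" in lower and any(kw in lower for kw in JS_KEYWORDS)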

@@ -298,7 +296,7 @@ def web_data():
         - The line matches the pattern “r'^\\d+\\s+likes$'”,
         - The line contains only one word.
         """),
-
+        DV(
             "data/sample_refinedweb_line.json",
             0,
             "Sample documents with lines that are removed by the RefinedWeb rules",
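
The two RefinedWeb-style line rules quoted in this hunk can be sketched directly from the text; the helper name is illustrative, and the other RefinedWeb rules are omitted.

    import re

    LIKES_PATTERN = re.compile(r"^\d+\s+likes$")

    def is_refinedweb_noise_line(line: str) -> bool:
        """Drop counter lines such as '3 likes' and lines that contain only one word."""
        stripped = line.strip()
        return bool(LIKES_PATTERN.match(stripped)) or len(stripped.split()) == 1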

@@ -311,7 +309,7 @@ def web_data():
         line is in the first 3 lines or in the last 3 lines) to remove toxic lines. Specifically, we do not only consider
         the bad words from English but also consider the bad words from other languages.
         """),
-
+        DVS(
             json.load(open("data/toxic_lines.json")),
             "Sample documents with toxic lines",
         ),
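
A sketch of the toxic-line rule described above, assuming the multilingual bad-word lists are loaded into a single set; the position window (first or last 3 lines) follows the text, the rest is illustrative.

    def remove_toxic_edge_lines(lines: list, bad_words: set) -> list:
        """Drop a line if it contains a bad word and sits in the first 3 or last 3 lines."""
        kept = []
        for i, line in enumerate(lines):
            near_edge = i < 3 or i >= len(lines) - 3
            has_bad_word = any(word.lower() in bad_words for word in line.split())
            if near_edge and has_bad_word:
                continue
            kept.append(line)
        return kept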

@@ -319,7 +317,7 @@ def web_data():
         P("""
         In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
         Overview of all the quality signals that are used for filtering."""),
-
+        DVS(
             json.load(open("data/all_signals.json")),
             "Overview of all the quality signals that are used for filtering",
         ),

@@ -368,9 +366,10 @@ def web_data():
         ensures consistency with the overall document character count calculation.
         """),
         H5("Our Implementation"),
-
-
-
+        DV(
+            "data/repeat_line_frac.jsonl",
+            0,
+            "Sample documents filtered by excessive line repetitions / characters in repeated lines",
         ),
         H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
         P("""
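
The signal added here ("excessive line repetitions / characters in repeated lines") is, in one common variant, the fraction of characters that fall in lines occurring more than once. A sketch under that assumption; the exact occurrence-counting convention is the one discussed in the surrounding section.

    from collections import Counter

    def dup_line_char_fraction(text: str) -> float:
        """Fraction of characters in lines that occur more than once (counting every copy)."""
        lines = [line for line in text.split("\n") if line.strip()]
        total_chars = sum(len(line) for line in lines)
        if total_chars == 0:
            return 0.0
        counts = Counter(lines)
        dup_chars = sum(len(line) for line in lines if counts[line] > 1)
        return dup_chars / total_chars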

@@ -394,9 +393,10 @@ def web_data():
         only once — tend to be short.
         """),
         H5("Our Implementations"),
-
-
-
+        DV(
+            "data/sample_top_ngram.json",
+            0,
+            "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
         ),
         H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
         P("""
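
The signal in 3.1.2 measures how many characters are covered by the single most frequent word n-gram (n=2,3,4). A minimal sketch; counting only word characters (no whitespace) is an assumption.

    from collections import Counter

    def top_ngram_char_fraction(words: list, n: int) -> float:
        """Fraction of word characters covered by the most common word n-gram."""
        ngrams = [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
        total_chars = sum(len(w) for w in words)
        if not ngrams or total_chars == 0:
            return 0.0
        ngram, count = Counter(ngrams).most_common(1)[0]
        return count * sum(len(w) for w in ngram) / total_chars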

@@ -423,18 +423,15 @@ def web_data():
         We decided to use the RedPajama V2 implementation but skip the 1st occurrence of the duplicate n-gram.
         """),
         H5("Our Implementations"),
-        Img(
-            src="path/to/sample_dup_ngrams.png",
-            alt="Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
-        ),
         H5("An Example to Show the Difference Between Above Implementations"),
         P("..."), # Add specific examples if available
         H5(
             "Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
         ),
-
-
-
+        DV(
+            "data/sample_dup_ngram.json",
+            0,
+            "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
         ),
         H4("3.2 Line-wise Heuristics"),
         P("""
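
One way to read "use the RedPajama V2 implementation but skip the 1st occurrence" is to mark only the repeat occurrences of each n-gram and measure the characters they cover. A sketch under that reading, for a single n:

    def dup_ngram_char_fraction(words: list, n: int) -> float:
        """Fraction of word characters inside repeated n-grams, ignoring each n-gram's first occurrence."""
        total_chars = sum(len(w) for w in words)
        if total_chars == 0 or len(words) < n:
            return 0.0
        seen = set()
        duplicated = [False] * len(words)
        for i in range(len(words) - n + 1):
            ngram = tuple(words[i:i + n])
            if ngram in seen:
                for j in range(i, i + n):  # only repeat occurrences mark their words
                    duplicated[j] = True
            else:
                seen.add(ngram)
        return sum(len(w) for w, d in zip(words, duplicated) if d) / total_chars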

@@ -443,9 +440,10 @@ def web_data():
         works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
         90% of lines start with a bullet point.
         """),
-
-
-
+        DV(
+            "data/line_info.json",
+            0,
+            "Sample documents that are filtered out by line-wise heuristics",
         ),
         H4("3.3 Statistics-based Heuristics"),
         P("""
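
The two document-level thresholds in 3.2 translate directly into a check like the following; the bullet and ellipsis character sets are assumptions.

    def fails_line_heuristics(lines: list) -> bool:
        """Drop the document if >30% of lines end with an ellipsis or >90% start with a bullet."""
        if not lines:
            return False
        ellipsis_frac = sum(l.rstrip().endswith(("...", "…")) for l in lines) / len(lines)
        bullet_frac = sum(l.lstrip().startswith(("-", "*", "•")) for l in lines) / len(lines)
        return ellipsis_frac > 0.30 or bullet_frac > 0.90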

@@ -505,10 +503,6 @@ median_word_length = median(len(word) for word in words)
         The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions
         to split text into sentences.
         """),
-        Img(
-            src="path/to/sample_sentences_split.png",
-            alt="Sample documents split into sentences",
-        ),
         P("""
         However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
         we opted to use `nltk.tokenize.sent_tokenize` for more accurate sentence splitting.
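
The switch away from regex splitting is motivated in the text above: periods inside URLs should not count as sentence boundaries. A small usage sketch of nltk.tokenize.sent_tokenize (the example sentence is made up):

    import nltk
    from nltk.tokenize import sent_tokenize

    nltk.download("punkt", quiet=True)  # Punkt models used by sent_tokenize

    text = "See https://example.com/a.b for details. This is the second sentence."
    print(sent_tokenize(text))
    # Punkt splits at sentence-final punctuation followed by whitespace,
    # so the periods inside the URL are not treated as sentence endings.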

@@ -522,10 +516,6 @@ median_word_length = median(len(word) for word in words)
         Following RedPajama-V2 and DataTrove, we use the symbols of ("#", "...", "…").
         We calculate the ratio as the number of symbols divided by the total number of words.
         """),
-        Img(
-            src="path/to/sample_symbol_word_ratio.png",
-            alt="Sample documents filtered by symbol-to-word ratio",
-        ),
         H5("Fraction of Alphabetic Words"),
         P("""
         Implementations from Dolma
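
The symbol-to-word ratio described here counts occurrences of "#", "..." and "…" against the word count. A sketch; how overlapping symbols are counted may differ from the RedPajama-V2 and DataTrove implementations.

    SYMBOLS = ("#", "...", "…")

    def symbol_to_word_ratio(text: str) -> float:
        """Number of symbol occurrences divided by the total number of words."""
        words = text.split()
        if not words:
            return 0.0
        return sum(text.count(symbol) for symbol in SYMBOLS) / len(words)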

@@ -549,19 +539,17 @@ median_word_length = median(len(word) for word in words)
             alt="Sample documents filtered by number of stop words",
         ),
         H5("Our Implementations"),
-
-
-
+        DV(
+            "data/sample_doc_stat.json",
+            0,
+            "Sample documents that are filtered out by statistics-based heuristics",
         ),
         H4("3.4 Others"),
         P("""
         Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
         text.
         """),
-        Img(
-            src="path/to/sample_lorem_ipsum.png",
-            alt="Sample documents containing 'lorem ipsum'",
-        ),
+        DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum"),
         H3("4. Deduplication"),
         P("..."), # Add detailed content and images as needed
         H3("5. PII Removal"),
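
Two of the checks referenced in this last hunk are simple enough to sketch directly: the median word length statistic shown in the hunk context and the C4 lorem-ipsum page filter; the helper names are illustrative.

    from statistics import median

    def median_word_length(text: str) -> float:
        words = text.split()
        return median(len(word) for word in words) if words else 0.0

    def contains_lorem_ipsum(text: str) -> bool:
        """C4-style rule: flag any page containing the placeholder phrase."""
        return "lorem ipsum" in text.lower()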