from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from fasthtml.components import HR
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print

app, rt = fast_app(debug=True)
# Route decorators are added below so the hx_get targets in the navigation
# resolve; the paths mirror the hx_get values used in this file.
@rt("/")
def main():
    return Html(
        Head(
            Meta(charset="UTF-8"),
            Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
            Script(src="https://distill.pub/template.v2.js"),
            Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
            Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
            Link(rel="stylesheet", href="style.css"),
        ),
        Body(
            D_title(
                H1(
                    "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
                    cls="l-body",
                    style="text-align: center;",
                ),
                Div(
                    Img(src="images/llm360_logo.png"),
                    id="title-plot",
                    cls="main-plot-container l-page",
                ),
            ),
            D_article(
                D_contents(
                    Nav(
                        H3("Table of Contents"),
                        Div(
                            A("TxT360", href="#_self"),
                            hx_get="/intro",
                            hx_target="#inner-text",
                        ),
                        Div(
                            Ul(
                                Li(
                                    A(
                                        "Introduction",
                                        href="/intro#section1",
                                        hx_get="/intro#section1",
                                        hx_target="#inner-text",
                                    )
                                ),
                                Li(
                                    A(
                                        "Background",
                                        href="/intro#section2",
                                        hx_get="/intro#section2",
                                        hx_target="#inner-text",
                                    )
                                ),
                                Li(
                                    A(
                                        "Main Content",
                                        href="/intro#section3",
                                        hx_get="/intro#section3",
                                        hx_target="#inner-text",
                                    )
                                ),
                                Li(
                                    A(
                                        "Conclusion",
                                        href="/intro#section4",
                                        hx_get="/intro#section4",
                                        hx_target="#inner-text",
                                    )
                                ),
                            ),
                        ),
                        Div(
                            A("Web Data", href="#inner-text"),
                            hx_get="/webdata",
                            hx_target="#inner-text",
                        ),
                        Div(
                            A("Curated Sources", href="#inner-text"),
                            hx_get="/curated",
                            hx_target="#inner-text",
                        ),
                        Div(
                            A("Common Steps", href="#inner-text"),
                            hx_get="/common",
                            hx_target="#inner-text",
                        ),
                        Div(
                            A("TxT360 Results", href="#inner-text"),
                            hx_get="/results",
                            hx_target="#inner-text",
                        ),
                        role="navigation",
                        cls="l-text figcaption",
                    ),
                    prerendered="true",
                ),
                intro(),
            ),
        ),
        lang="en",
    )
@rt("/intro")
def intro():
    return Div(
        Section(
            H2("Introduction"),
            P("""We are excited to introduce TxT360, a large-scale,
            comprehensive, and fully transparent dataset designed for Large
            Language Model (LLM) pre-training. TxT360 is engineered to strike a
            balance between the quantity and quality of pre-training data,
            pushing the limit on both fronts. This comprehensive dataset
            encompasses both expansive web-based data and highly curated data
            sources, making it one of the most robust LLM pre-training corpora
            available today. Our web data component includes 99 snapshots from
            Common Crawl, amassing 5.7 trillion tokens and occupying 11 TB of
            disk space in jsonl.gz format. On the curated side, TxT360
            integrates one of the most extensive collections of high-quality
            sources across multiple domains: 14 curated sources spanning 10
            domains, ensuring diverse and rich content. To maintain the highest
            quality, we meticulously pre-processed the web data to filter out
            low-quality content and conducted thorough reviews of the curated
            sources. This process not only unified their formats but also
            identified and rectified any anomalies. We not only open-source
            100% of our processing scripts, but also release the details of our
            data reviews, revealing the decision-making processes behind data
            selection and quality assurance. This level of transparency allows
            researchers and practitioners to fully understand the dataset’s
            composition and make informed decisions when using TxT360 for
            training. Additionally, TxT360 includes detailed documentation and
            analysis of the data, covering distribution statistics, domain
            coverage, and the processing pipeline, which helps users navigate
            and utilize the dataset effectively. Overall, TxT360 represents a
            significant step forward in the availability and transparency of
            large-scale training data for language models, setting a new
            standard for dataset quality and openness."""),
            id="section1",
        ),
        Section(
            H2("Background"),
            P(
                """The quality and size of a pre-training dataset play a
                crucial role in the performance of large language models
                (LLMs). The community has introduced a variety of datasets for
                this purpose, including purely web-based datasets like
                RefinedWeb [1], RedPajama-Data-V2 [2], DCLM [3], and FineWeb
                [4], as well as comprehensive datasets derived from multiple
                highly-curated data sources such as The Pile [5],
                RedPajama-Data-V1 [6], and Dolma [7]. It is commonly known that
                web-based datasets provide a vast quantity of data, while
                highly-curated multi-source datasets consistently deliver high
                quality and diversity, both critical for effective LLM
                pre-training. However, despite advances on both fronts, each
                type of dataset has its limitations. For instance, the
                processing scripts for the web dataset RefinedWeb, known for
                its high quality, are not public, and only about 10% of the
                entire dataset has been disclosed. Conversely, the web
                component of existing highly-curated multi-source datasets is
                relatively small, limiting their coverage and diversity
                relative to the scale of information available on the internet.
                By integrating the extensive reach of web data with the
                exceptional quality of curated sources, TxT360 is crafted to
                meet and surpass the rigorous standards required for
                state-of-the-art LLM pre-training."""
            ),
            id="section2",
        ),
        Section(
            H2("Main Content"),
            P("""The performance of a large language model (LLM) depends
            heavily on the quality and size of its pretraining dataset.
            However, the pretraining datasets for state-of-the-art open LLMs
            like Llama 3 and Mixtral are not publicly available and very little
            is known about how they were created. (Reading time: 45 min. For
            the best reading experience, we recommend not using a mobile
            phone.) Recently, we released 🍷 FineWeb, a new, large-scale
            (15-trillion tokens, 44TB disk space) dataset for LLM pretraining.
            FineWeb is derived from 96 CommonCrawl snapshots and produces
            better-performing LLMs than other open pretraining datasets. To
            bring more clarity to machine learning and advance the open
            understanding of how to train good quality large language models,
            we carefully documented and ablated all of the design choices used
            in FineWeb, including in-depth investigations of deduplication and
            filtering strategies. The present long-form report is a deep dive
            into how to create a large and high-quality web-scale dataset for
            LLM pretraining. The dataset itself, 🍷 FineWeb, is available here.
            We are extremely thankful to the whole distill.pub team
            (Christopher Olah, Shan Carter, and Ludwig Schubert in particular)
            for creating the template on which we based this blog post, and for
            inspiring us with exquisitely crafted articles and blog posts. In
            this report we also introduce 📚 FineWeb-Edu, a subset of FineWeb
            constructed using scalable automated high-quality annotations for
            educational value, which outperforms all openly accessible web
            datasets on a number of educational benchmarks such as MMLU, ARC,
            and OpenBookQA. 📚 FineWeb-Edu is available in two sizes/filtering
            levels: 1.3 trillion (very high educational content) and 5.4
            trillion (high educational content) tokens (all token counts are
            measured with the GPT2 tokenizer). You can download it here. Both
            datasets are released under the permissive ODC-By 1.0 license.
            TL;DR: this blog covers a discussion on processing and evaluating
            data quality at scale, the 🍷 FineWeb recipe (listing and
            explaining all of our design choices), and the process followed to
            create its 📚 FineWeb-Edu subset."""),
            id="section3",
        ),
        Section(
            H2("Conclusion"),
            P("""This is the conclusion section where we summarize the key
            points discussed in the blog post and provide final thoughts."""),
            id="section4",
        ),
        id="inner-text",
    )
@rt("/webdata")
def web_data():
    return Div(Section(H2(P("Web Data")), id="inner-text"))
def get_chart_28168342():
    fig = go.Figure()
    filter_names = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]
    data_sources = [
        ("Wikipedia", [100, 90, 80, 70, 60, 50, 40, 30, 20]),
        ("Freelaw", [100, 90, 80, 70, 60, 50, 40, 20, 20]),
        ("DM Maths", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("USPTO", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PG19", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Hackernews", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Ubuntu IRC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Europarl", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("StackExchange", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Arxiv", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PhilPapers", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
    ]
    for name, x_values in data_sources:
        fig.add_trace(
            go.Funnel(
                name=name,
                orientation="h",
                y=filter_names,
                x=x_values,
                textinfo="value+percent total",
                textposition="inside",
            )
        )
    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return fig
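# Illustrative helper only (not used by the chart above): go.Funnel is fed x
# values expressed as percentages of the original document count, which is what
# the hard-coded lists in get_chart_28168342 stand in for. Given real per-stage
# document counts, those lists could be derived as sketched below; the example
# counts in the comment are made up.
def counts_to_funnel_x(counts):
    """Convert per-stage document counts into percent-of-first-stage values."""
    total = counts[0]
    return [round(100 * c / total, 1) for c in counts]


# Example: counts_to_funnel_x([1_000_000, 900_000, 800_000]) -> [100.0, 90.0, 80.0]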
@rt("/curated")
def curated(request):
    from curated import get_data

    # Partial updates: when HTMX passes query parameters, return only the
    # requested data preview instead of the full page section.
    params = request.query_params
    if data_source := params.get("data_source"):
        return get_data(data_source, params.get("doc_id", 3))
    if doc_id := params.get("doc_id"):
        return get_data(params.get("data_source"), doc_id)
    hr = HR()
    data_preparation_steps = pd.DataFrame(
        {
            "Method": [
                "HTTP/FTP dumps",
                "Web crawling",
                "Archive snapshot",
                "Generated",
                "Curated",
            ],
            "Description": [
                "Acquiring data from HTTP/FTP dumps",
                "Crawling websites to extract data",
                "Working with archive dumps",
                "Generating synthetic data",
                "High quality curated data",
            ],
            "Source": [
                "Freelaw | Wikipedia | PhilPapers | Arxiv | S2ORC | Pubmeds",
                "USPTO | Hackernews | Ubuntu IRC",
                "StackExchange",
                "DM Maths",
                "PG19 | Europarl",
            ],
        }
    )
    table_html = data_preparation_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")
| text = P("""This initial stage serves as the foundation for the entire | |
| process. Here, we focus on acquiring and extracting the raw data, which can | |
| come from various sources such as crawling websites, using HTTP/FTP dumps, | |
| or working with archive dumps. For instance, to download and prepare a | |
| dataset, we can specific downloaders based on the data source. Each dataset | |
| might have its own downloader script which can be updated in real time to | |
| handle changes in the data source. Here is a general outline of the data | |
| preparation process: It's worth noting that some pipelines might require | |
| invoking additional functions or scripts to handle specific data sources or | |
| formats. These helper scripts can be located within specific directories | |
| or modules dedicated to the dataset.""") | |
    data_preparation_div = Div(
        H3("Data Preparation"),
        text,
        table_div,
        Div(get_data(), style="border: 1px solid #ccc; padding: 20px;"),
    )
    text = P("""Data preprocessing is a crucial step in the data science
    pipeline. It involves cleaning and transforming raw data into a format that
    is suitable for analysis. This process includes handling missing values,
    normalizing data, encoding categorical variables, and more.""")
    preprocessing_steps = pd.DataFrame(
        {
            "Step": [
                "Language Filter",
                "Min Word Count",
                "Title Abstract",
                "Majority Language",
                "Paragraph Count",
                "Frequency",
                "Unigram Log Probability",
            ],
            "Description": [
                "Filtering data based on language",
                "Setting a minimum word count threshold",
                "Extracting information from the title and abstract",
                "Identifying the majority language in the dataset",
                "Counting the number of paragraphs in each document",
                "Calculating the frequency of each word in the dataset",
                "Calculating the log probability of each unigram",
            ],
            "Need": [
                "To remove documents in unwanted languages",
                "To filter out documents with very few words",
                "To extract relevant information for analysis",
                "To understand the distribution of languages in the dataset",
                "To analyze the structure and length of documents",
                "To identify important words in the dataset",
                "To measure the significance of individual words",
            ],
            "Pros": [
                "Improves data quality by removing irrelevant documents",
                "Filters out low-quality or incomplete documents",
                "Provides additional information for analysis",
                "Enables language-specific analysis and insights",
                "Helps understand the complexity and content of documents",
                "Identifies important terms and topics in the dataset",
                "Quantifies the importance of individual words",
            ],
            "Cons": [
                "May exclude documents in less common languages",
                "May remove documents with valuable information",
                "May introduce bias in the analysis",
                "May not accurately represent the language distribution",
                "May not capture the complexity of document structure",
                "May be sensitive to noise and outliers",
                "May not capture the semantic meaning of words",
            ],
        }
    )
    table_html = preprocessing_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")
    data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
    return Div(
        Section(
            H2("Curated Sources"),
            plotly2fasthtml(get_chart_28168342()),
            data_preparation_div,
            data_preprocessing_div,
            id="inner-text",
        )
    )
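# Illustrative sketch only (not part of the original pipeline): the Data
# Preparation text above describes per-source downloader scripts selected by
# data source. A minimal dispatch for that idea might look like the following;
# the source names come from the table, while the download functions and URLs
# are hypothetical placeholders.
def download_http_dump(url, dest):
    """Fetch a single HTTP or FTP dump to a local file and return its path."""
    import urllib.request

    urllib.request.urlretrieve(url, dest)
    return dest


DOWNLOADERS = {
    # source name -> callable that acquires the raw data for that source
    "Freelaw": lambda: download_http_dump("https://example.org/freelaw.jsonl.gz", "freelaw.jsonl.gz"),
    "Wikipedia": lambda: download_http_dump("https://example.org/wiki.xml.bz2", "wiki.xml.bz2"),
    # crawled or generated sources would register their own scripts here
}


def prepare_source(name):
    """Run the downloader registered for a curated source, if one exists."""
    if name not in DOWNLOADERS:
        raise KeyError(f"No downloader registered for {name!r}")
    return DOWNLOADERS[name]()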
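# Illustrative sketch only: the Data Preprocessing table above lists document
# filters such as "Language Filter" and "Min Word Count". Two of those steps
# could be expressed as simple predicates like the ones below; the word-count
# threshold and the externally supplied language code are hypothetical
# placeholders, not the thresholds used for TxT360.
def passes_min_word_count(text, min_words=50):
    """Keep documents that contain at least `min_words` whitespace-separated words."""
    return len(text.split()) >= min_words


def passes_language_filter(detected_lang, allowed=("en",)):
    """Keep documents whose detected language code is in the allowed set."""
    return detected_lang in allowed


# Example usage on a toy document:
# doc = {"text": "some document text ...", "lang": "en"}
# keep = passes_language_filter(doc["lang"]) and passes_min_word_count(doc["text"])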
@rt("/common")
def common_steps():
    return Div(Section(H2(P("Common Steps")), id="inner-text"))


@rt("/results")
def results():
    return Div(Section(H2(P("Results")), id="inner-text"))


serve()