Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import json | |
| import pandas as pd | |
| from datasets import load_dataset | |
# Page chrome: wide layout and a title shown at the top of the app.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.title("The Stack data Inspection")

# Per-extension statistics; the columns read in this file are "language",
# "extension" and the three "*_count" columns.
df = pd.read_csv("extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Group extensions by language, keeping the row order of the CSV both for the
# languages (dict insertion order) and for the extensions of each language.
tags = {}
for _, record in df.iterrows():
    tags.setdefault(record["language"], []).append(record["extension"])
all_languages = list(tags)
def load_data(language, ext):
    """Load the inspection samples for one language/extension pair.

    Reads the "train" split of the "loubnabnl/the-stack-inspection-data"
    dataset, restricted to the ``data/<language>/<ext>`` sub-directory,
    and returns whatever ``load_dataset`` returns for it.
    """
    subset_dir = f"data/{language}/{ext}"
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=subset_dir,
        split="train",
    )
# Two narrow selectors on the left; the third column is only a spacer.
col1, col2, _ = st.columns([1, 1, 4])
with col1:
    chosen_language = st.selectbox(
        label="Select a programming language", options=all_languages, index=0
    )
with col2:
    # Only the extensions recorded for the chosen language are offered.
    chosen_ext = st.selectbox(
        label="Select an extension", options=tags[chosen_language], index=0
    )

# load the dataset and get indexes of non lexable files
samples = load_data(chosen_language, chosen_ext)
max_docs = len(samples)
# Attach a positional index so it survives the filter below.
samples = samples.add_column("idx", range(max_docs))
not_lexed = samples.filter(lambda x: not x["lexable"])
# Materialise as a plain list so it is rendered as a list in the message.
indexes_not_lexed = list(not_lexed["idx"])

# info about extension
st.markdown("### Information about the extension:")
# Look the extension's statistics row up once instead of once per column.
ext_rows = df[df["extension"] == chosen_ext]
low_alphanum_count = ext_rows["low_alphanum_count"].values[0]
long_lines_count = ext_rows["long_lines_count"].values[0]
non_lexable_count = ext_rows["non_lexable_count"].values[0]
text = (
    f"Extension {chosen_ext} has {max_docs} files, "
    f"{low_alphanum_count} with very low alphanumeric ratio, "
    f"{long_lines_count} with very long lines, and {non_lexable_count} "
    f"are not lexable. These files are at indexes: {indexes_not_lexed}."
)
st.markdown(text)

col_1, _ = st.columns([2, 4])
with col_1:
    index_example = st.number_input(
        f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

# info about the chosen example
example = samples[index_example]
st.markdown("#### Information about the chosen example:")
# Each phrase must be derived from the flag it describes: previously the
# "low alphanumeric ratio" phrase read the long_lines flag and vice versa,
# so the two properties were reported under each other's label.
text_alpha = "**has**" if example["low_alphanum"] else "doesn't have"
text_lines = "**has**" if example["long_lines"] else "doesn't have"
text_lexer = "is" if example["lexable"] else "**isn't**"
st.markdown(
    f"Example {index_example} {text_alpha} a very low alphanumeric ratio, "
    f"{text_lines} very long lines, and {text_lexer} lexable."
)

# display file content
st.markdown("#### File content:")
if not example["lexable"]:
    # NOTE(review): st.write renders strings as Markdown, so file content that
    # contains Markdown syntax may be displayed altered — consider st.text.
    st.write(f"File can't be lexed so we remove syntax highlighting.\nContent:\n {example['content']}")
else:
    st.code(example["content"], language=chosen_language)