Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Upload 3 files
Browse files
- README.md +5 -5
- app.py +206 -0
- requirements.txt +6 -0
    	
        README.md
    CHANGED
    
    | @@ -1,12 +1,12 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
            -
            title: Dataset  | 
| 3 | 
            -
            emoji:  | 
| 4 | 
            -
            colorFrom:  | 
| 5 | 
            -
            colorTo:  | 
| 6 | 
             
            sdk: gradio
         | 
| 7 | 
             
            sdk_version: 4.36.1
         | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
             
            ---
         | 
| 11 |  | 
| 12 | 
            -
            Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
         | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
            +
            title: Dataset Insights Explorer
         | 
| 3 | 
            +
            emoji: π»
         | 
| 4 | 
            +
            colorFrom: gray
         | 
| 5 | 
            +
            colorTo: pink
         | 
| 6 | 
             
            sdk: gradio
         | 
| 7 | 
             
            sdk_version: 4.36.1
         | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
             
            ---
         | 
| 11 |  | 
| 12 | 
            +
            Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
         | 
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,206 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            TODOS:
         | 
| 3 | 
            +
            - Improve prompts
         | 
| 4 | 
            +
            - Improve model usage (Quantization?)
         | 
| 5 | 
            +
            - Improve error handling
         | 
| 6 | 
            +
            - Add more tests
         | 
| 7 | 
            +
            - Make responses more user-friendly
         | 
| 8 | 
            +
            """
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            import gradio as gr
         | 
| 11 | 
            +
            from gradio_huggingfacehub_search import HuggingfaceHubSearch
         | 
| 12 | 
            +
            import duckdb
         | 
| 13 | 
            +
            import pandas as pd
         | 
| 14 | 
            +
            import requests
         | 
| 15 | 
            +
            from outlines import prompt
         | 
| 16 | 
            +
            from transformers import AutoTokenizer, AutoModelForCausalLM
         | 
| 17 | 
            +
            import spaces
         | 
| 18 | 
            +
            import json
         | 
| 19 | 
            +
            import torch
         | 
| 20 | 
            +
            import logging
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
         | 
| 23 | 
            +
            logger = logging.getLogger(__name__)
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            """
         | 
| 26 | 
            +
            Methods for generating potential questions and SQL queries
         | 
| 27 | 
            +
            """
         | 
| 28 | 
            +
            device = "cuda"
         | 
| 29 | 
            +
            gemma_model_id = "google/gemma-2b-it"
         | 
| 30 | 
            +
            gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_model_id)
         | 
| 31 | 
            +
            gemma_model = AutoModelForCausalLM.from_pretrained(
         | 
| 32 | 
            +
                gemma_model_id,
         | 
| 33 | 
            +
                device_map="auto",
         | 
| 34 | 
            +
                torch_dtype=torch.bfloat16
         | 
| 35 | 
            +
            )
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            @spaces.GPU
         | 
| 38 | 
            +
            def generate_potential_questions_with_gemma(prompt):
         | 
| 39 | 
            +
                input_ids = gemma_tokenizer(prompt, return_tensors="pt").to(device)
         | 
| 40 | 
            +
                outputs = gemma_model.generate(**input_ids, max_new_tokens=1024)
         | 
| 41 | 
            +
                return gemma_tokenizer.decode(outputs[0], skip_special_tokens=True)
         | 
| 42 | 
            +
             | 
| 43 | 
            +
             | 
| 44 | 
            +
            @prompt
         | 
| 45 | 
            +
            def prompt_for_questions(dataset, schema, first_rows):
         | 
| 46 | 
            +
                """
         | 
| 47 | 
            +
                You are a data analyst tasked with exploring a dataset named {{ dataset }}.
         | 
| 48 | 
            +
                Below is the dataset schema in SQL format along with a sample of 3 rows:
         | 
| 49 | 
            +
                {{ schema }}
         | 
| 50 | 
            +
                Sample rows:
         | 
| 51 | 
            +
                {% for example in first_rows %}
         | 
| 52 | 
            +
                {{ example}}
         | 
| 53 | 
            +
                {% endfor %}
         | 
| 54 | 
            +
                Your goal is to generate a list of 5 potential questions that a user might want
         | 
| 55 | 
            +
                to ask about this dataset. Consider the information contained in the provided
         | 
| 56 | 
            +
                columns and rows, and try to think of meaningful questions that could
         | 
| 57 | 
            +
                provide insights or useful information. For each question, provide the SQL query
         | 
| 58 | 
            +
                that would extract the relevant information from the dataset.
         | 
| 59 | 
            +
                Output JSON format:
         | 
| 60 | 
            +
                {
         | 
| 61 | 
            +
                    "questions": [
         | 
| 62 | 
            +
                        {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"},
         | 
| 63 | 
            +
                        {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"},
         | 
| 64 | 
            +
                        {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"},
         | 
| 65 | 
            +
                        {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"},
         | 
| 66 | 
            +
                        {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"}
         | 
| 67 | 
            +
                    ]
         | 
| 68 | 
            +
                }
         | 
| 69 | 
            +
                Please ensure that each SQL query retrieves relevant information from the dataset to answer the corresponding question accurately.
         | 
| 70 | 
            +
                Return only the JSON object, do not add extra information.
         | 
| 71 | 
            +
                """
         | 
| 72 | 
            +
             | 
| 73 | 
            +
            """
         | 
| 74 | 
            +
            Methods for generating SQL based on a user request
         | 
| 75 | 
            +
            """
         | 
| 76 | 
            +
            mother_duckdb_model_id = "motherduckdb/DuckDB-NSQL-7B-v0.1"
         | 
| 77 | 
            +
            mother_duck_tokenizer = AutoTokenizer.from_pretrained(mother_duckdb_model_id)
         | 
| 78 | 
            +
            mother_duck_model = AutoModelForCausalLM.from_pretrained(
         | 
| 79 | 
            +
                mother_duckdb_model_id,
         | 
| 80 | 
            +
                device_map="auto",
         | 
| 81 | 
            +
                torch_dtype=torch.bfloat16
         | 
| 82 | 
            +
            )
         | 
| 83 | 
            +
             | 
| 84 | 
            +
            @spaces.GPU
         | 
| 85 | 
            +
            def generate_sql_with_mother_duck(prompt):
         | 
| 86 | 
            +
                input_ids = mother_duck_tokenizer(prompt, return_tensors="pt").to(device).input_ids
         | 
| 87 | 
            +
                generated_ids = mother_duck_model.generate(input_ids, max_length=1024)
         | 
| 88 | 
            +
                return mother_duck_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
         | 
| 89 | 
            +
             | 
| 90 | 
            +
             | 
| 91 | 
            +
            @prompt
         | 
| 92 | 
            +
            def prompt_for_sql(ddl_create, query_input):
         | 
| 93 | 
            +
                """
         | 
| 94 | 
            +
                ### Instruction:
         | 
| 95 | 
            +
                Your task is to generate valid duckdb SQL to answer the following question.
         | 
| 96 | 
            +
                ### Input:
         | 
| 97 | 
            +
                Here is the database schema that the SQL query will run on:
         | 
| 98 | 
            +
                {{ ddl_create }}
         | 
| 99 | 
            +
                
         | 
| 100 | 
            +
                ### Question:
         | 
| 101 | 
            +
                {{ query_input }}
         | 
| 102 | 
            +
                ### Response (use duckdb shorthand if possible):
         | 
| 103 | 
            +
                """
         | 
| 104 | 
            +
             | 
| 105 | 
            +
             | 
| 106 | 
            +
            """
         | 
| 107 | 
            +
            Datasets Viewer Methods
         | 
| 108 | 
            +
            https://huggingface.co/docs/datasets-server/index
         | 
| 109 | 
            +
            """
         | 
| 110 | 
            +
             | 
| 111 | 
            +
            def get_first_parquet(dataset: str):
         | 
| 112 | 
            +
                resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}")
         | 
| 113 | 
            +
                return resp.json()["parquet_files"][0]
         | 
| 114 | 
            +
             | 
| 115 | 
            +
             | 
| 116 | 
            +
            def get_dataset_schema(parquet_url: str):
         | 
| 117 | 
            +
                con = duckdb.connect()
         | 
| 118 | 
            +
                con.execute(f"CREATE TABLE data as SELECT * FROM '{parquet_url}' LIMIT 1;")
         | 
| 119 | 
            +
                result = con.sql("SELECT sql FROM duckdb_tables() where table_name ='data';").df()
         | 
| 120 | 
            +
                ddl_create = result.iloc[0,0]
         | 
| 121 | 
            +
                con.close()
         | 
| 122 | 
            +
                return ddl_create
         | 
| 123 | 
            +
             | 
| 124 | 
            +
             | 
| 125 | 
            +
            def get_first_rows_as_df(dataset: str, config: str, split: str, limit:int):
         | 
| 126 | 
            +
                resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}")
         | 
| 127 | 
            +
                rows = resp.json()["rows"]
         | 
| 128 | 
            +
                rows = [row['row'] for row in rows]
         | 
| 129 | 
            +
                return pd.DataFrame.from_dict(rows).sample(frac = 1).head(limit)
         | 
| 130 | 
            +
             | 
| 131 | 
            +
            """
         | 
| 132 | 
            +
            Main logic, to get the recommended queries
         | 
| 133 | 
            +
            """
         | 
| 134 | 
            +
            def get_recommended_queries(dataset: str):
         | 
| 135 | 
            +
                ddl_create, prompt = "", ""
         | 
| 136 | 
            +
                try:
         | 
| 137 | 
            +
                    first_split = get_first_parquet(dataset)
         | 
| 138 | 
            +
                    df_first_rows = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 3)
         | 
| 139 | 
            +
                    first_parquet_url = first_split["url"]
         | 
| 140 | 
            +
                    logger.info(f"First parquet URL: {first_parquet_url}")
         | 
| 141 | 
            +
                    ddl_create = get_dataset_schema(first_parquet_url)
         | 
| 142 | 
            +
                    prompt = prompt_for_questions(dataset, ddl_create, df_first_rows.to_dict('records'))
         | 
| 143 | 
            +
                    txt_questions = generate_potential_questions_with_gemma(prompt).split("``json")[1].replace('\n', ' ').strip()[:-4]
         | 
| 144 | 
            +
                    data = json.loads(txt_questions)
         | 
| 145 | 
            +
                    questions = data["questions"]
         | 
| 146 | 
            +
                    potential_questions = []
         | 
| 147 | 
            +
                    for question in questions:
         | 
| 148 | 
            +
                        try:
         | 
| 149 | 
            +
                            sql = question["sql_query"].replace("FROM data", f"FROM '{first_parquet_url}'")
         | 
| 150 | 
            +
                            result = duckdb.sql(sql).df()
         | 
| 151 | 
            +
                            potential_questions.append({"question": question["question"], "result": result, "sql_query": sql})
         | 
| 152 | 
            +
                            continue
         | 
| 153 | 
            +
                        except Exception as err:
         | 
| 154 | 
            +
                            logger.error(f"Error in running SQL query: {question['sql_query']} {err}")
         | 
| 155 | 
            +
                            mother_duck_prompt = prompt_for_sql(ddl_create, question["question"])
         | 
| 156 | 
            +
                            sql = generate_sql_with_mother_duck(mother_duck_prompt).split("### Response (use duckdb shorthand if possible):")[-1].strip()
         | 
| 157 | 
            +
                            sql = sql.replace("FROM data", f"FROM '{first_parquet_url}'")
         | 
| 158 | 
            +
                            try:
         | 
| 159 | 
            +
                                result = duckdb.sql(sql).df()
         | 
| 160 | 
            +
                                potential_questions.append({"question": question["question"], "result": result, "sql_query": sql})
         | 
| 161 | 
            +
                            except:
         | 
| 162 | 
            +
                                pass
         | 
| 163 | 
            +
                    df_result = pd.DataFrame(potential_questions)
         | 
| 164 | 
            +
                except Exception as err:
         | 
| 165 | 
            +
                    logger.error(f"Error in getting recommended queries: {err}")
         | 
| 166 | 
            +
                    return {
         | 
| 167 | 
            +
                        gr_txt_ddl: ddl_create,
         | 
| 168 | 
            +
                        gr_txt_prompt: prompt,
         | 
| 169 | 
            +
                        gr_df_result: pd.DataFrame([{"error": f"β {err=}"}])
         | 
| 170 | 
            +
                    }
         | 
| 171 | 
            +
                return {
         | 
| 172 | 
            +
                    gr_txt_ddl: ddl_create,
         | 
| 173 | 
            +
                    gr_txt_prompt: prompt,
         | 
| 174 | 
            +
                    gr_df_result: df_result
         | 
| 175 | 
            +
                }
         | 
| 176 | 
            +
             | 
| 177 | 
            +
             | 
| 178 | 
            +
            def preview_dataset(dataset: str):
         | 
| 179 | 
            +
                try:
         | 
| 180 | 
            +
                    first_split = get_first_parquet(dataset)
         | 
| 181 | 
            +
                    df = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 4)
         | 
| 182 | 
            +
                except Exception as err:
         | 
| 183 | 
            +
                    df = pd.DataFrame([{"Unable to preview dataset": f"β {err=}"}])
         | 
| 184 | 
            +
                return {
         | 
| 185 | 
            +
                    gr_df_first_rows: df
         | 
| 186 | 
            +
                }
         | 
| 187 | 
            +
             | 
| 188 | 
            +
             | 
| 189 | 
            +
            with gr.Blocks() as demo:
         | 
| 190 | 
            +
                gr.Markdown("# π« Dataset Insights Explorer π«")
         | 
| 191 | 
            +
                gr_dataset_name = HuggingfaceHubSearch(
         | 
| 192 | 
            +
                        label="Hub Dataset ID",
         | 
| 193 | 
            +
                        placeholder="Search for dataset id on Huggingface",
         | 
| 194 | 
            +
                        search_type="dataset",
         | 
| 195 | 
            +
                        value="jamescalam/world-cities-geo",
         | 
| 196 | 
            +
                    )
         | 
| 197 | 
            +
                gr_preview_btn = gr.Button("Preview Dataset")
         | 
| 198 | 
            +
                gr_df_first_rows = gr.DataFrame(datatype="markdown")
         | 
| 199 | 
            +
                gr_recommend_btn = gr.Button("Show Insights")
         | 
| 200 | 
            +
                gr_df_result = gr.DataFrame(datatype="markdown")
         | 
| 201 | 
            +
                with gr.Accordion("Open for details", open=False):
         | 
| 202 | 
            +
                    gr_txt_ddl = gr.Textbox(label="Dataset as CREATE DDL", interactive= False)
         | 
| 203 | 
            +
                    gr_txt_prompt = gr.Textbox(label="Generated prompt to get recommended questions", interactive= False)
         | 
| 204 | 
            +
                gr_preview_btn.click(preview_dataset, inputs=[gr_dataset_name], outputs=[gr_df_first_rows])
         | 
| 205 | 
            +
                gr_recommend_btn.click(get_recommended_queries, inputs=[gr_dataset_name], outputs=[gr_txt_ddl, gr_txt_prompt, gr_df_result])
         | 
| 206 | 
            +
            demo.launch()
         | 
    	
        requirements.txt
    ADDED
    
    | @@ -0,0 +1,6 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            gradio_huggingfacehub_search==0.0.7
         | 
| 2 | 
            +
            duckdb
         | 
| 3 | 
            +
            pandas
         | 
| 4 | 
            +
            outlines
         | 
| 5 | 
            +
            transformers
         | 
| 6 | 
            +
            accelerate
         | 
