Spaces:

miracle-ema
/

DataViz

Running

App Files Files Community

miracle-ema committed on Mar 27

Commit

713c79c

verified ·

1 Parent(s): f6dde48

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +32 -0
app.py +286 -0
requirements.txt +13 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,32 @@

+## use the official python 3.9 image
+FROM python:3.9
+## set the working directory to /code
+WORKDIR /code
+## copy only requirements.txt into the container at /code, so the dependency
+## layer below stays cached when just the application code changes
+COPY ./requirements.txt /code/requirements.txt
+## Install the requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# set up a new user named "user" with UID 1000 and a real home directory (-m);
+# without -m, /home/user does not exist even though HOME points at it below
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# create the app directory as "user": app.py creates its uploads/ and image
+# folders at import time, so this directory must be writable by that user
+# (a directory created implicitly by WORKDIR may end up owned by root)
+RUN mkdir -p $HOME/app
+# set the working directory to the user's home directory
+WORKDIR $HOME/app
+# copy the current directory contents into the container at $HOME/app setting the user as the owner to avoid permission issues
+COPY --chown=user . $HOME/app
+## Start the FastAPI app with uvicorn on port 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,286 @@

+from fastapi import FastAPI, File, UploadFile, HTTPException, Form
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import os
+import logging
+from huggingface_hub import InferenceClient
+from dotenv import load_dotenv
+import hashlib
+import ast
+import re
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+load_dotenv()
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+app.mount("/static", StaticFiles(directory="static"), name="static")
+API_TOKEN = os.getenv("HF_TOKEN")
+if not API_TOKEN:
+    raise ValueError("HUGGINGFACE_API_TOKEN environment variable not set.")
+MODEL_NAME = "bigcode/starcoder"
+client = InferenceClient(model=MODEL_NAME, token=API_TOKEN)
+UPLOAD_DIR = "uploads"
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+IMAGES_DIR = os.path.join("../static", "images")
+os.makedirs(IMAGES_DIR, exist_ok=True)
+def detect_plot_type(prompt):
+    """Detect the requested plot type from the prompt."""
+    prompt_lower = prompt.lower()
+    if "bar" in prompt_lower:
+        return "bar"
+    elif "histogram" in prompt_lower or "distribution" in prompt_lower:
+        return "histogram"
+    elif "line" in prompt_lower:
+        return "line"
+    else:
+        return "scatter"
+@app.post("/upload/")
+async def upload_file(file: UploadFile = File(...)):
+    if not file.filename.endswith(".xlsx"):
+        raise HTTPException(status_code=400, detail="File must be an Excel file (.xlsx)")
+    file_path = os.path.join(UPLOAD_DIR, file.filename)
+    with open(file_path, "wb") as buffer:
+        buffer.write(await file.read())
+    logger.info(f"File uploaded: {file.filename}")
+    return {"filename": file.filename}
+@app.post("/generate-visualization/")
+async def generate_visualization(prompt: str = Form(...), filename: str = Form(...)):
+    file_path = os.path.join(UPLOAD_DIR, filename)
+    if not os.path.exists(file_path):
+        raise HTTPException(status_code=404, detail="File not found on server.")
+    try:
+        df = pd.read_excel(file_path)
+        if df.empty:
+            raise ValueError("Excel file is empty.")
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Error reading Excel file: {str(e)}")
+    plot_type = detect_plot_type(prompt)
+    allow_groupby = "average" in prompt.lower() or "mean" in prompt.lower()
+    input_text = f"""
+You are a Python code generator specializing in data visualization. The DataFrame 'df' is already loaded from an Excel file '{filename}' with columns {', '.join(df.columns)}.
+The user requests: '{prompt}'.
+Instructions:
+- Generate Python code to create a {plot_type} plot based on the user's natural language prompt using the pre-loaded DataFrame 'df', pandas (pd), matplotlib.pyplot (plt), and seaborn (sns).
+- Include the following imports at the top of the code, preceded by a comment:
+  # import libraries
+  import pandas as pd
+  import matplotlib.pyplot as plt
+  import seaborn as sns
+- Include a line to read the DataFrame, preceded by a comment (even though it will be removed during execution):
+  # load data
+  df = pd.read_excel('{filename}')
+- Add xlabel and ylabel using human-readable forms inferred from the prompt (e.g., 'Petal Length' if the prompt mentions "petal length").
+- Add a title using plt.title(). Format based on plot type and prompt context:
+  - Scatter: "<X> vs <Y>" or "<X> vs <Y> by <Hue>" if "colored by" is present
+  - Bar: "<Y> by <X>" or "Average <Y> by <X>" if averages are requested
+  - Histogram: "Distribution of <X>"
+  - Line: "<Y> by <X>" or "<X> vs <Y>"
+- For averages, use df.groupby().mean() if "average" or "mean" is in the prompt.
+- Plot type specifics:
+  - Scatter: Use sns.scatterplot with hue=<column> if "colored by" is present, else plt.scatter
+  - Bar: Use sns.barplot; apply groupby if averages are requested
+  - Histogram: Use sns.histplot
+  - Line: Use sns.lineplot
+- Automatically infer column names from the prompt and match them to the exact DataFrame columns ({', '.join(df.columns)}) based on context. Use the exact column names as they appear in the DataFrame.
+- Include plt.show() at the end (will be removed during execution).
+- Output only the Python code as valid Python.
+Examples:
+  - For "Create a scatter plot of column1 vs column2":
+    # import libraries
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    # load data
+    df = pd.read_excel('{filename}')
+    sns.scatterplot(x='column1', y='column2', data=df)
+    plt.xlabel('Column 1')
+    plt.ylabel('Column 2')
+    plt.title('Column 1 vs Column 2')
+    plt.show()
+  - For "Create a scatter plot of column1 vs column2 colored by column3":
+    # import libraries
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    # load data
+    df = pd.read_excel('{filename}')
+    sns.scatterplot(x='column1', y='column2', hue='column3', data=df)
+    plt.xlabel('Column 1')
+    plt.ylabel('Column 2')
+    plt.title('Column 1 vs Column 2 by Column3')
+    plt.show()
+  - For "Create a bar chart of column1 by column2":
+    # import libraries
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    # load data
+    df = pd.read_excel('{filename}')
+    sns.barplot(x='column2', y='column1', data=df)
+    plt.xlabel('Column 2')
+    plt.ylabel('Column 1')
+    plt.title('Column 1 by Column 2')
+    plt.show()
+  - For "Create a bar chart of average column1 by column2":
+    # import libraries
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    # load data
+    df = pd.read_excel('{filename}')
+    sns.barplot(x='column2', y='column1', data=df.groupby('column2').mean().reset_index())
+    plt.xlabel('Column 2')
+    plt.ylabel('Average Column 1')
+    plt.title('Average Column 1 by Column 2')
+    plt.show()
+  - For "Create a histogram of column1":
+    # import libraries
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    # load data
+    df = pd.read_excel('{filename}')
+    sns.histplot(df['column1'])
+    plt.xlabel('Column 1')
+    plt.ylabel('Frequency')
+    plt.title('Distribution of Column 1')
+    plt.show()
+  - For "Create a line chart of column1 by column2":
+    # import libraries
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    # load data
+    df = pd.read_excel('{filename}')
+    sns.lineplot(x='column2', y='column1', data=df)
+    plt.xlabel('Column 2')
+    plt.ylabel('Column 1')
+    plt.title('Column 1 by Column 2')
+    plt.show()
+Generate the code for the user's request now. Output only the Python code, nothing else:
+"""
+    try:
+        raw_generated_code = client.text_generation(input_text, max_new_tokens=400)
+        logger.info(f"Raw generated code: '{raw_generated_code}'")
+    except Exception as e:
+        logger.error(f"Error querying model: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error querying model: {str(e)}")
+    if not raw_generated_code.strip():
+        logger.error("No code generated by the AI model.")
+        raise HTTPException(status_code=500, detail="No code generated by the AI model.")
+    cleaned_code = raw_generated_code.strip().replace('```', '').replace('"""', '').replace("'''", '')
+    lines = cleaned_code.splitlines()
+    cleaned_code = "\n".join(
+        line.strip() for line in lines
+        if line.strip()
+        and not line.strip().startswith(('#', 'def', 'class', 'import', 'df ='))
+        and not any(kw in line for kw in ["pd.read_csv", "pd.read_excel", "http", "raise", "print", "plt.show"])
+        and not re.match(r'^\s*\d+\s+.*$', line)
+        and not re.match(r'^\s*$$   .*rows.*columns   $$\s*$', line)
+    ).strip()
+    logger.info(f"Cleaned code: '{cleaned_code}'")
+    if not cleaned_code:
+        logger.error("Cleaned code is empty after filtering.")
+        raise HTTPException(status_code=500, detail="Generated code is empty or contains only disallowed content")
+    try:
+        ast.parse(cleaned_code)
+    except SyntaxError as e:
+        logger.error(f"Syntax error in cleaned code: '{cleaned_code}' Exception: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Syntax error in generated code: {str(e)}")
+    plot_hash = hashlib.md5(f"{filename}_{prompt}".encode()).hexdigest()[:8]
+    plot_filename = f"plot_{plot_hash}.png"
+    plot_path = os.path.join(IMAGES_DIR, plot_filename)
+    try:
+        exec_globals = {"pd": pd, "plt": plt, "sns": sns, "df": df}
+        plt.close('all')
+        plt.clf()
+        plt.cla()
+        fig = plt.figure(figsize=(8, 6))
+        exec(cleaned_code, exec_globals)
+        if not fig.get_axes():
+            plt.close('all')
+            raise ValueError("Generated code produced an empty plot")
+        plt.savefig(plot_path, bbox_inches="tight")
+        logger.info(f"Plot saved to {plot_path}")
+        plt.close('all')
+    except Exception as e:
+        plt.close('all')
+        logger.error(f"Error executing cleaned code: '{cleaned_code}' Exception: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error executing code: {str(e)}")
+    if not os.path.exists(plot_path):
+        raise HTTPException(status_code=500, detail="Plot file was not created.")
+    plot_url = f"/static/images/{plot_filename}?t={int(pd.Timestamp.now().timestamp())}"
+    return {"plot_url": plot_url, "generated_code": raw_generated_code}
+@app.get("/")
+async def serve_frontend():
+    with open("static/index.html", "r") as f:
+        return HTMLResponse(content=f.read())

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+fastapi==0.115.0
+uvicorn==0.30.6
+pandas==2.2.2
+matplotlib==3.9.4
+seaborn==0.13.2
+python-multipart==0.0.9
+transformers==4.45.2
+torch==2.4.1
+openpyxl==3.1.5
+python-dotenv==1.0.1
+huggingface_hub==0.23.4