Upload 4 files

- fetch_data.py +15 -0
- generate_schema.py +44 -0
- main.py +29 -0
- synthetic_generator.py +69 -0
fetch_data.py
ADDED
@@ -0,0 +1,15 @@
import requests
import pandas as pd
from io import BytesIO
from Utils.config import DATASET_URLS


def fetch_real_data(domain):
    url = DATASET_URLS.get(domain)
    if not url:
        raise ValueError(f"No URL found for domain: {domain}")

    response = requests.get(url)
    response.raise_for_status()

    df = pd.read_csv(BytesIO(response.content))
    return df
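fetch_data.py imports DATASET_URLS from Utils.config, a module that is not part of this upload. A minimal sketch of what it might contain, assuming a simple domain-to-URL mapping (the URLs below are placeholders, not real endpoints):

# Utils/config.py -- hypothetical sketch; the real module is not included in this upload
DATASET_URLS = {
    "healthcare": "https://example.com/data/healthcare.csv",  # placeholder URL
    "finance": "https://example.com/data/finance.csv",        # placeholder URL
}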
generate_schema.py
ADDED
@@ -0,0 +1,44 @@
import requests
import json
import os
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv("hf_token")

# NOTE: HF_MODEL_URL and HEADERS were referenced below but never defined in the
# original file; the values here are assumed placeholders.
HF_MODEL_URL = "https://api-inference.huggingface.co/models/<model-id>"  # replace <model-id>
HEADERS = {"Authorization": f"Bearer {API_KEY}"}


def generate_schema(user_prompt):
    """Generates a synthetic dataset schema using the Hugging Face API."""

    system_prompt = """
    You are an expert data scientist designing synthetic datasets.
    For any given dataset description, generate:
    - Column names
    - Data types (string, int, float, date)
    - Approximate row count

    Output in **pure JSON** format like:
    {
        "columns": ["PatientID", "Age", "Gender", "Diagnosis"],
        "types": ["int", "int", "string", "string"],
        "size": 500
    }
    """

    payload = {
        "inputs": system_prompt + "\n\nUser request: " + user_prompt,
        "options": {"wait_for_model": True}
    }

    response = requests.post(HF_MODEL_URL, headers=HEADERS, json=payload)

    if response.status_code == 200:
        try:
            output = response.json()[0]['generated_text']
            schema = json.loads(output.strip())  # Convert the model output to JSON
            return schema
        except json.JSONDecodeError:
            return {"error": "Invalid JSON output from model. Try again."}
    else:
        return {"error": f"API request failed. Status Code: {response.status_code}"}
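Because generate_schema returns an error dict instead of raising, callers should check for the "error" key before indexing into the result. A minimal usage sketch (the prompt string is illustrative):

schema = generate_schema("500 patient records with age, gender, and diagnosis")
if "error" in schema:
    raise RuntimeError(schema["error"])  # surface API / JSON failures early
print(schema["columns"], schema["types"], schema["size"])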
main.py
ADDED
@@ -0,0 +1,29 @@
import argparse
import pandas as pd
from generate_schema import generate_schema
from fetch_data import fetch_real_data
from synthetic_generator import train_and_generate_synthetic


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", type=str, required=True, help="Describe the dataset you want")
    parser.add_argument("--domain", type=str, default="healthcare", help="Domain to fetch real data from (optional)")
    args = parser.parse_args()

    # Step 1: Generate schema using LLM
    schema = generate_schema(args.prompt)
    if not schema or "error" in schema:  # generate_schema returns an error dict on failure
        raise SystemExit(f"Schema generation failed: {schema}")
    print(f"🔍 Generated schema: {schema}")

    # Step 2: Fetch real data (optional)
    real_data = fetch_real_data(args.domain)

    # Step 3: Preprocess (if necessary)
    real_data = real_data[schema['columns']]  # Match columns from schema
    print(f"✅ Fetched real data with shape: {real_data.shape}")

    # Step 4: Train GAN and generate synthetic data
    output_path = f"outputs/synthetic_{args.domain}.csv"
    train_and_generate_synthetic(real_data, schema, output_path)


if __name__ == "__main__":
    main()
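A typical invocation, assuming Utils/config.py maps the chosen domain to a dataset URL (the prompt text is illustrative):

python main.py --prompt "Patient demographics with age, gender, and diagnosis" --domain healthcare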
synthetic_generator.py
ADDED
@@ -0,0 +1,69 @@
import pandas as pd
from ctgan import CTGAN
from sklearn.preprocessing import LabelEncoder
import os
import json
import requests


def train_and_generate_synthetic(real_data, schema, output_path):
    """Trains a CTGAN model and generates synthetic data."""
    categorical_cols = [col for col, dtype in zip(schema['columns'], schema['types']) if dtype == 'string']

    # Store label encoders (work on a copy so the caller's DataFrame is not mutated)
    real_data = real_data.copy()
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        real_data[col] = le.fit_transform(real_data[col])
        label_encoders[col] = le

    # Train CTGAN, flagging the encoded columns as discrete
    gan = CTGAN(epochs=300)
    gan.fit(real_data, categorical_cols)

    # Generate synthetic data
    synthetic_data = gan.sample(schema['size'])

    # Decode categorical columns back to their original string values
    for col in categorical_cols:
        synthetic_data[col] = label_encoders[col].inverse_transform(synthetic_data[col])

    # Save to CSV, creating the target directory if needed
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    synthetic_data.to_csv(output_path, index=False)
    print(f"✅ Synthetic data saved to {output_path}")


def generate_schema(prompt):
    """Fetches schema from an external API and validates JSON."""
    API_URL = "https://api.example.com/schema"  # Replace with correct API URL
    headers = {"Authorization": "Bearer YOUR_HUGGINGFACE_TOKEN"}  # Add if needed

    try:
        response = requests.post(API_URL, json={"prompt": prompt}, headers=headers)
        print("🔍 Raw API Response:", response.text)  # Debugging line

        schema = response.json()

        # Validate required keys
        if 'columns' not in schema or 'types' not in schema or 'size' not in schema:
            raise ValueError("❌ Invalid schema format! Expected keys: 'columns', 'types', 'size'")

        print("✅ Valid Schema Received:", schema)  # Debugging line
        return schema

    except json.JSONDecodeError:
        print("❌ Failed to parse JSON response. API might be down or returning non-JSON data.")
        return None
    except requests.exceptions.RequestException as e:
        print(f"❌ API request failed: {e}")
        return None


def fetch_data(domain):
    """Fetches real data for the given domain and ensures it's a valid DataFrame."""
    data_path = f"datasets/{domain}.csv"
    if os.path.exists(data_path):
        df = pd.read_csv(data_path)
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise ValueError("❌ Loaded data is invalid!")
        return df
    else:
        raise FileNotFoundError(f"❌ Dataset for {domain} not found.")
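For reference, a standalone usage sketch of train_and_generate_synthetic on toy data. The frame below is fabricated for illustration; CTGAN's defaults train on a few hundred rows, though realistic use wants considerably more:

import numpy as np
import pandas as pd
from synthetic_generator import train_and_generate_synthetic

# Fabricated toy data -- column names match a hypothetical two-column schema
df = pd.DataFrame({
    "Age": np.random.randint(20, 80, size=200),
    "Gender": np.random.choice(["F", "M"], size=200),
})
schema = {"columns": ["Age", "Gender"], "types": ["int", "string"], "size": 100}

# Trains CTGAN (300 epochs, per the hardcoded value) and writes outputs/toy.csv
train_and_generate_synthetic(df, schema, "outputs/toy.csv")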