sinan7 committed (verified)
Commit 6a74c0e
Parent: 79898af

Upload 3 files

Files changed (3):
  1. Dockerfile +24 -0
  2. main.py +132 -0
  3. requirements.txt +0 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ # Use a lightweight Python runtime
+ FROM python:3.11-slim
+
+ # Set the working directory in the container
+ WORKDIR /code
+
+ # Copy the requirements file and install dependencies
+ COPY ./requirements.txt /code/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Create writable cache directory for Hugging Face models
+ RUN mkdir -p /code/hf_cache && chmod -R 777 /code/hf_cache
+
+ # Set environment variables for Hugging Face cache
+ ENV HF_HOME=/code/hf_cache
+
+ # Copy the application code
+ COPY ./main.py /code/main.py
+
+ # Expose the application port
+ EXPOSE 7860
+
+ # Start the FastAPI application
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "300"]
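
To try the image locally (the image tag here is just an example, not part of the commit), the standard Docker workflow applies: build with `docker build -t cv-extractor .` and run with `docker run -p 7860:7860 cv-extractor`, after which the API listens on http://localhost:7860.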
main.py ADDED
@@ -0,0 +1,132 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
+ from pydantic import BaseModel
+ import tempfile
+ import os
+ import fitz  # PyMuPDF for PDF handling
+ import torch
+ import json
+ from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel
+ import logging
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Initialize the GPT2-medium pipeline
+ qa_pipeline = pipeline(
+     "text-generation",
+     model="gpt2-medium",  # Switching to GPT2-medium
+     tokenizer="gpt2-medium",
+     device=-1  # Use CPU
+ )
+
+ class Education(BaseModel):
+     degree: str
+     university: str
+     graduation_year: str
+
+ class ExtractedInfo(BaseModel):
+     work_experience: str
+     education: Education
+     professional_course_detail: str
+     software_usage: str
+     safety_course_detail: str
+     hse_description: str
+     good_conduct_certificate: str
+
+ def extract_text_from_pdf(pdf_path: str) -> str:
+     """Extracts text from a PDF file."""
+     with fitz.open(pdf_path) as doc:
+         text = "".join([page.get_text() for page in doc])
+     if not text.strip():
+         raise HTTPException(status_code=400, detail="PDF contains no extractable text.")
+     return text
+
+ def chunk_text(text: str, max_tokens: int = 500) -> list:
+     """Splits the text into chunks small enough that chunk + prompt template
+     + generated tokens stay within GPT-2's 1024-token context window."""
+     tokens = qa_pipeline.tokenizer.encode(text)
+     chunks = [
+         qa_pipeline.tokenizer.decode(tokens[i:i + max_tokens])
+         for i in range(0, len(tokens), max_tokens)
+     ]
+     return chunks
+
+ def generate_structured_output(text: str) -> dict:
+     """Generates structured output from the text using GPT2-medium."""
+     chunks = chunk_text(text)
+
+     # Collect results from each chunk
+     generated_text = ""
+     for chunk in chunks:
+         prompt = f"""
+ Extract the following information from the resume in JSON format:
+ {{
+     "work_experience": "<Summarized single work experience>",
+     "education": {{
+         "degree": "<Degree obtained>",
+         "university": "<University attended>",
+         "graduation_year": "<Year of graduation>"
+     }},
+     "professional_course_detail": "<Details of professional courses completed>",
+     "software_usage": "<List of software tools used>",
+     "safety_course_detail": "<Safety courses completed>",
+     "hse_description": "<HSE (Health, Safety, Environment) practices>",
+     "good_conduct_certificate": "<Details of good conduct certificate>"
+ }}
+ Resume text:
+ {chunk}
+ """
+
+         # do_sample=True is required for temperature to take effect
+         response = qa_pipeline(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)
+         generated_text += response[0]["generated_text"]
+
+     # Extract JSON from the generated text
+     try:
+         json_start = generated_text.find("{")
+         json_end = generated_text.rfind("}") + 1
+         # rfind returns -1 when no "}" exists, so json_end is 0 in that case;
+         # require the closing brace to come after the opening one
+         if json_start != -1 and json_end > json_start:
+             json_str = generated_text[json_start:json_end]
+             return json.loads(json_str)
+         else:
+             raise ValueError("No valid JSON found in the model output")
+     except Exception as e:
+         logger.error(f"Error generating structured output: {e}")
+         raise HTTPException(status_code=500, detail="Failed to generate structured output.")
+
+ @app.post("/process_cv/", response_model=ExtractedInfo)
+ async def process_cv(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
+     """Processes a PDF resume and extracts structured information."""
+     background_tasks.add_task(clean_temp_files)
+
+     if not file.filename.lower().endswith(".pdf"):
+         raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+         content = await file.read()
+         temp_file.write(content)
+         temp_path = temp_file.name
+
+     try:
+         text = extract_text_from_pdf(temp_path)
+         structured_data = generate_structured_output(text)
+         return ExtractedInfo(**structured_data)
+     except HTTPException:
+         raise  # Preserve intended status codes (e.g. 400 for empty PDFs)
+     except Exception as e:
+         logger.error(f"Unexpected error: {e}")
+         raise HTTPException(status_code=500, detail="An error occurred while processing the PDF.")
+     finally:
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+
+ def clean_temp_files():
+     """Cleans up old temporary files."""
+     temp_dir = tempfile.gettempdir()
+     for filename in os.listdir(temp_dir):
+         if filename.endswith(".pdf"):
+             try:
+                 os.remove(os.path.join(temp_dir, filename))
+                 logger.info(f"Deleted temporary file: {filename}")
+             except Exception as e:
+                 logger.warning(f"Failed to delete {filename}: {e}")
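
For reference, a minimal client sketch for the endpoint above (not part of the commit; it assumes the server is running locally on port 7860, that a file named resume.pdf exists, and that the requests package is installed):

import requests

# Send a PDF resume to the running service (URL and filename are illustrative).
with open("resume.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:7860/process_cv/",
        files={"file": ("resume.pdf", f, "application/pdf")},
    )

response.raise_for_status()
print(response.json())  # fields follow the ExtractedInfo schema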
requirements.txt ADDED
Binary file (296 Bytes).
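
The contents of requirements.txt are not rendered in this view. Judging by the imports in main.py, it presumably lists at least fastapi, uvicorn, pymupdf, torch, transformers, and python-multipart (which FastAPI needs for UploadFile handling), but the exact packages and version pins cannot be confirmed from this diff.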