sinan7 committed (verified)
Commit 6a74c0e
Parent: 79898af

Upload 3 files

Files changed (3):
  1. Dockerfile +24 -0
  2. main.py +132 -0
  3. requirements.txt +0 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ # Use a lightweight Python runtime
+ FROM python:3.11-slim
+
+ # Set the working directory in the container
+ WORKDIR /code
+
+ # Copy the requirements file and install dependencies
+ COPY ./requirements.txt /code/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Create writable cache directory for Hugging Face models
+ RUN mkdir -p /code/hf_cache && chmod -R 777 /code/hf_cache
+
+ # Set environment variables for Hugging Face cache
+ ENV HF_HOME=/code/hf_cache
+
+ # Copy the application code
+ COPY ./main.py /code/main.py
+
+ # Expose the application port
+ EXPOSE 7860
+
+ # Start the FastAPI application
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "300"]
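
To try the image locally (the image tag here is just an example, not part of the commit), the standard Docker workflow applies: build with `docker build -t cv-extractor .` and run with `docker run -p 7860:7860 cv-extractor`, after which the API listens on http://localhost:7860.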
main.py ADDED
@@ -0,0 +1,132 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
+ from pydantic import BaseModel
+ import tempfile
+ import os
+ import fitz  # PyMuPDF for PDF handling
+ import torch
+ import json
+ from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel
+ import logging
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Initialize the GPT2-medium pipeline
+ qa_pipeline = pipeline(
+     "text-generation",
+     model="gpt2-medium",  # Switching to GPT2-medium
+     tokenizer="gpt2-medium",
+     device=-1  # Use CPU
+ )
+
+ class Education(BaseModel):
+     degree: str
+     university: str
+     graduation_year: str
+
+ class ExtractedInfo(BaseModel):
+     work_experience: str
+     education: Education
+     professional_course_detail: str
+     software_usage: str
+     safety_course_detail: str
+     hse_description: str
+     good_conduct_certificate: str
+
+ def extract_text_from_pdf(pdf_path: str) -> str:
+     """Extracts text from a PDF file."""
+     with fitz.open(pdf_path) as doc:
+         text = "".join([page.get_text() for page in doc])
+     if not text.strip():
+         raise HTTPException(status_code=400, detail="PDF contains no extractable text.")
+     return text
+
+ def chunk_text(text: str, max_tokens: int = 500) -> list:
+     """Splits the text into chunks small enough that chunk + prompt template
+     + generated tokens stay within GPT-2's 1024-token context window."""
+     tokens = qa_pipeline.tokenizer.encode(text)
+     chunks = [
+         qa_pipeline.tokenizer.decode(tokens[i:i + max_tokens])
+         for i in range(0, len(tokens), max_tokens)
+     ]
+     return chunks
+
+ def generate_structured_output(text: str) -> dict:
+     """Generates structured output from the text using GPT2-medium."""
+     chunks = chunk_text(text)
+
+     # Collect results from each chunk
+     generated_text = ""
+     for chunk in chunks:
+         prompt = f"""
+ Extract the following information from the resume in JSON format:
+ {{
+     "work_experience": "<Summarized single work experience>",
+     "education": {{
+         "degree": "<Degree obtained>",
+         "university": "<University attended>",
+         "graduation_year": "<Year of graduation>"
+     }},
+     "professional_course_detail": "<Details of professional courses completed>",
+     "software_usage": "<List of software tools used>",
+     "safety_course_detail": "<Safety courses completed>",
+     "hse_description": "<HSE (Health, Safety, Environment) practices>",
+     "good_conduct_certificate": "<Details of good conduct certificate>"
+ }}
+ Resume text:
+ {chunk}
+ """
+
+         # do_sample=True is required for temperature to take effect
+         response = qa_pipeline(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)
+         generated_text += response[0]["generated_text"]
+
+     # Extract JSON from the generated text
+     try:
+         json_start = generated_text.find("{")
+         json_end = generated_text.rfind("}") + 1
+         # rfind returns -1 when no "}" exists, so json_end is 0 in that case;
+         # require the closing brace to come after the opening one
+         if json_start != -1 and json_end > json_start:
+             json_str = generated_text[json_start:json_end]
+             return json.loads(json_str)
+         else:
+             raise ValueError("No valid JSON found in the model output")
+     except Exception as e:
+         logger.error(f"Error generating structured output: {e}")
+         raise HTTPException(status_code=500, detail="Failed to generate structured output.")
+
+ @app.post("/process_cv/", response_model=ExtractedInfo)
+ async def process_cv(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
+     """Processes a PDF resume and extracts structured information."""
+     background_tasks.add_task(clean_temp_files)
+
+     if not file.filename.lower().endswith(".pdf"):
+         raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+         content = await file.read()
+         temp_file.write(content)
+         temp_path = temp_file.name
+
+     try:
+         text = extract_text_from_pdf(temp_path)
+         structured_data = generate_structured_output(text)
+         return ExtractedInfo(**structured_data)
+     except HTTPException:
+         raise  # Preserve intended status codes (e.g. 400 for empty PDFs)
+     except Exception as e:
+         logger.error(f"Unexpected error: {e}")
+         raise HTTPException(status_code=500, detail="An error occurred while processing the PDF.")
+     finally:
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+
+ def clean_temp_files():
+     """Cleans up old temporary files."""
+     temp_dir = tempfile.gettempdir()
+     for filename in os.listdir(temp_dir):
+         if filename.endswith(".pdf"):
+             try:
+                 os.remove(os.path.join(temp_dir, filename))
+                 logger.info(f"Deleted temporary file: {filename}")
+             except Exception as e:
+                 logger.warning(f"Failed to delete {filename}: {e}")
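
For reference, a minimal client sketch for the endpoint above (not part of the commit; it assumes the server is running locally on port 7860, that a file named resume.pdf exists, and that the requests package is installed):

import requests

# Send a PDF resume to the running service (URL and filename are illustrative).
with open("resume.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:7860/process_cv/",
        files={"file": ("resume.pdf", f, "application/pdf")},
    )

response.raise_for_status()
print(response.json())  # fields follow the ExtractedInfo schema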
requirements.txt ADDED
Binary file (296 Bytes).
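
The contents of requirements.txt are not rendered in this view. Judging by the imports in main.py, it presumably lists at least fastapi, uvicorn, pymupdf, torch, transformers, and python-multipart (which FastAPI needs for UploadFile handling), but the exact packages and version pins cannot be confirmed from this diff.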