Upload 6 files
Browse files- LICENSE +7 -0
- analytics.py +73 -0
- explore.ipynb +102 -0
- llm_integration.py +106 -0
- main.py +123 -0
- pdf_processing.py +29 -0
LICENSE
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright 2025, PASQUALE SALOMONE
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4 |
+
|
5 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6 |
+
|
7 |
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
analytics.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from datetime import datetime
|
3 |
+
from difflib import SequenceMatcher
|
4 |
+
from google.cloud import storage
|
5 |
+
from google.oauth2 import service_account
|
6 |
+
import os
|
7 |
+
|
8 |
+
# CONFIG
|
9 |
+
BUCKET_NAME = "post_generator1"
|
10 |
+
LOG_DIR = "analytics_logs"
|
11 |
+
MAX_HISTORY = 10
|
12 |
+
|
13 |
+
# Load credentials if GOOGLE_APPLICATION_CREDENTIALS contains JSON content
|
14 |
+
gcp_creds_env = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
|
15 |
+
|
16 |
+
if gcp_creds_env and gcp_creds_env.strip().startswith('{'):
|
17 |
+
# It's raw JSON content from Hugging Face secret or env var
|
18 |
+
creds_dict = json.loads(gcp_creds_env)
|
19 |
+
credentials = service_account.Credentials.from_service_account_info(creds_dict)
|
20 |
+
client = storage.Client(credentials=credentials, project=creds_dict.get("project_id"))
|
21 |
+
else:
|
22 |
+
# Fall back to default method (e.g. GOOGLE_APPLICATION_CREDENTIALS as a file path or local login)
|
23 |
+
client = storage.Client()
|
24 |
+
|
25 |
+
bucket = client.bucket(BUCKET_NAME)
|
26 |
+
def log_analytics(event_type: str, metadata: dict, content: str = None) -> None:
    """Write one analytics event to the GCS bucket as its own JSONL object.

    Args:
        event_type: Short label for the event (e.g. "generation", "feedback");
            it is also embedded in the object name.
        metadata: JSON-serialisable details about the event.
        content: Optional text stored under the "content" key; omitted from
            the entry when empty or None.
    """
    # Capture the clock once so the entry timestamp, the date folder and the
    # file name all describe the same instant (separate now() calls could
    # straddle a second or a midnight boundary).
    now = datetime.now()

    entry = {
        "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
        "event_type": event_type,
        "metadata": metadata,
    }
    if content:
        entry["content"] = content

    # One object per event:
    # analytics_logs/YYYY-MM-DD/HH-MM-SS-mmm_<event_type>.jsonl
    date_folder = now.strftime("%Y-%m-%d")
    time_stamp = now.strftime("%H-%M-%S-%f")[:-3]  # trim microseconds to milliseconds
    log_filename = f"{LOG_DIR}/{date_folder}/{time_stamp}_{event_type}.jsonl"

    blob = bucket.blob(log_filename)
    blob.upload_from_string(json.dumps(entry) + "\n", content_type="application/jsonl")
|
43 |
+
|
44 |
+
|
45 |
+
def get_recent_generated_posts() -> list:
    """Return the content of up to MAX_HISTORY logged "generation" events.

    Every blob under the LOG_DIR prefix is listed, visited in reverse name
    order, and its lines are read last-to-first. Lines that are not valid
    JSON, or that decode to something other than a JSON object, are skipped.

    Returns:
        A list of post strings, in the order they were encountered.
    """
    posts = []

    # Reverse name order; for objects named by log_analytics (date folder +
    # time prefix) this visits the newest events first.
    blobs = sorted(
        client.list_blobs(BUCKET_NAME, prefix=LOG_DIR + "/"),
        key=lambda b: b.name,
        reverse=True,
    )

    for blob in blobs:
        if not blob.name.endswith(".jsonl"):
            continue

        content = blob.download_as_text()
        for line in reversed(content.strip().splitlines()):
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A valid JSON scalar or array has no .get(); ignore it rather
            # than crash the caller with AttributeError.
            if not isinstance(data, dict):
                continue
            if data.get("event_type") == "generation" and "content" in data:
                posts.append(data["content"])
                if len(posts) >= MAX_HISTORY:
                    return posts
    return posts
|
66 |
+
|
67 |
+
def is_too_similar(new_post: str, threshold: float = 0.85) -> bool:
    """Report whether ``new_post`` closely matches a recently logged post.

    Each post returned by ``get_recent_generated_posts()`` is compared with
    ``new_post`` using ``difflib.SequenceMatcher``.

    Args:
        new_post: Candidate post text.
        threshold: Similarity ratio that must be strictly exceeded for a
            post to count as too similar.

    Returns:
        True as soon as one earlier post exceeds the threshold, else False.
    """
    return any(
        SequenceMatcher(None, new_post, earlier).ratio() > threshold
        for earlier in get_recent_generated_posts()
    )
|
explore.ipynb
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
+
"id": "dd0f778e",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stdout",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"{'timestamp': '2025-06-19 13:53:30', 'event_type': 'generation', 'metadata': {'tone': 'Mario Bros Style', 'version': 'v2-Experimental with richer sentence variety and longer posts', 'length': 1739}, 'content': \"As I delved into the fascinating story of the Afro-Napoli United, a football team that brings together migrants and Italians, I couldn't help but wonder: can sports truly be a powerful tool for integration? 🤔\\n\\nThe research by Luca Bifulco and Adele Del Guercio highlights the complexities of integration, particularly in the context of the Campania region, where the team is based. The authors show how the Afro-Napoli United has not only promoted a multi-ethnic culture but has also influenced the lives of its players, fostering social capital and relational networks that are essential for accessing economic and social resources.\\n\\nWhat struck me was the team's ability to create a sense of belonging and pride among its players, many of whom are migrants. The authors note that the team's success has increased the players' local prestige and general trust, allowing them to access resources and services that might have been out of reach otherwise.\\n\\nAs I read through the document, I was struck by the importance of social capital in understanding social and economic outcomes. The authors draw on the work of Pierre Bourdieu and Robert Putnam, highlighting the role of social capital in facilitating integration and reducing socio-economic exclusion.\\n\\nThe Afro-Napoli United's story is a powerful reminder that sports can be a powerful tool for promoting integration and social cohesion. As the authors note, the team's experience is not without its challenges, but it offers valuable insights into the complex dynamics of integration and the role that sports can play in fostering social capital and relational networks.\\n\\nBased on work by Luca Bifulco and Adele Del Guercio (Publication Date Not Specified) #llm #sports #integration\"}\n",
|
14 |
+
"{'timestamp': '2025-06-19 13:54:05', 'event_type': 'feedback', 'metadata': {'sentiment': 'positive'}, 'content': \"As I delved into the fascinating story of the Afro-Napoli United, a football team that brings together migrants and Italians, I couldn't help but wonder: can sports truly be a powerful tool for integration? 🤔\\n\\nThe research by Luca Bifulco and Adele Del Guercio highlights the complexities of integration, particularly in the context of the Campania region, where the team is based. The authors show how the Afro-Napoli United has not only promoted a multi-ethnic culture but has also influenced the lives of its players, fostering social capital and relational networks that are essential for accessing economic and social resources.\\n\\nWhat struck me was the team's ability to create a sense of belonging and pride among its players, many of whom are migrants. The authors note that the team's success has increased the players' local prestige and general trust, allowing them to access resources and services that might have been out of reach otherwise.\\n\\nAs I read through the document, I was struck by the importance of social capital in understanding social and economic outcomes. The authors draw on the work of Pierre Bourdieu and Robert Putnam, highlighting the role of social capital in facilitating integration and reducing socio-economic exclusion.\\n\\nThe Afro-Napoli United's story is a powerful reminder that sports can be a powerful tool for promoting integration and social cohesion. As the authors note, the team's experience is not without its challenges, but it offers valuable insights into the complex dynamics of integration and the role that sports can play in fostering social capital and relational networks.\\n\\nBased on work by Luca Bifulco and Adele Del Guercio (Publication Date Not Specified) #llm #sports #integration\"}\n",
|
15 |
+
"{'timestamp': '2025-06-19 13:56:11', 'event_type': 'generation', 'metadata': {'tone': 'Professional', 'version': 'v1-Standard structure and tone', 'length': 1392}, 'content': \"I've read a fascinating document about the role of football in promoting integration and social capital among migrants. As someone who's passionate about understanding the complexities of social dynamics, I was particularly interested in the case study of Afro-Napoli United, a football team composed of migrant and Italian players.\\n\\nWhat struck me was the way the team's experience has influenced the lives of its players, not just in terms of their social capital, but also in their ability to access economic and social resources. The document highlights how the team's activities have helped players like Adama, a young Ivorian calciatore, to build relationships with locals and feel a sense of belonging in their new community.\\n\\nI was also impressed by the team's efforts to promote social capital and integration, particularly in the face of challenges such as discriminatory laws and practices. The document notes how the team has worked to overcome these obstacles, and how its players have benefited from the experience.\\n\\nAs I reflect on the insights from this document, I'm left with a question: how can we replicate the success of Afro-Napoli United in other contexts, and what role can football play in promoting social cohesion and integration in diverse communities?\\n\\nBased on work by Luca Bifulco and Adele Del Guercio (Publication Date Not Specified) #llm #sports #integration\"}\n",
|
16 |
+
"{'timestamp': '2025-06-19 13:56:16', 'event_type': 'feedback', 'metadata': {'sentiment': 'positive'}, 'content': \"I've read a fascinating document about the role of football in promoting integration and social capital among migrants. As someone who's passionate about understanding the complexities of social dynamics, I was particularly interested in the case study of Afro-Napoli United, a football team composed of migrant and Italian players.\\n\\nWhat struck me was the way the team's experience has influenced the lives of its players, not just in terms of their social capital, but also in their ability to access economic and social resources. The document highlights how the team's activities have helped players like Adama, a young Ivorian calciatore, to build relationships with locals and feel a sense of belonging in their new community.\\n\\nI was also impressed by the team's efforts to promote social capital and integration, particularly in the face of challenges such as discriminatory laws and practices. The document notes how the team has worked to overcome these obstacles, and how its players have benefited from the experience.\\n\\nAs I reflect on the insights from this document, I'm left with a question: how can we replicate the success of Afro-Napoli United in other contexts, and what role can football play in promoting social cohesion and integration in diverse communities?\\n\\nBased on work by Luca Bifulco and Adele Del Guercio (Publication Date Not Specified) #llm #sports #integration\"}\n"
|
17 |
+
]
|
18 |
+
}
|
19 |
+
],
|
20 |
+
"source": [
|
21 |
+
"import json\n",
|
22 |
+
"import os\n",
|
23 |
+
"from datetime import datetime\n",
|
24 |
+
"\n",
|
25 |
+
"def read_jsonl_file(filepath):\n",
|
26 |
+
" \"\"\"Read a JSONL file and return a list of JSON objects.\"\"\"\n",
|
27 |
+
" data = []\n",
|
28 |
+
" with open(filepath, 'r', encoding='utf-8') as file:\n",
|
29 |
+
" for line in file:\n",
|
30 |
+
" try:\n",
|
31 |
+
" data.append(json.loads(line.strip()))\n",
|
32 |
+
" except json.JSONDecodeError:\n",
|
33 |
+
" print(f\"Error parsing line: {line}\")\n",
|
34 |
+
" continue\n",
|
35 |
+
" return data\n",
|
36 |
+
"\n",
|
37 |
+
"# Example usage\n",
|
38 |
+
"log_dir = \"/Users/pasqualesalomone/Downloads/\"\n",
|
39 |
+
"filename = \"analytics_logs_2025-06-19.jsonl\" # Replace with your filename\n",
|
40 |
+
"filepath = os.path.join(log_dir, filename)\n",
|
41 |
+
"\n",
|
42 |
+
"if os.path.exists(filepath):\n",
|
43 |
+
" data = read_jsonl_file(filepath)\n",
|
44 |
+
" for entry in data:\n",
|
45 |
+
" print(entry)\n",
|
46 |
+
"else:\n",
|
47 |
+
" print(f\"File not found: {filepath}\")\n"
|
48 |
+
]
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"cell_type": "code",
|
52 |
+
"execution_count": 3,
|
53 |
+
"id": "82e61c49",
|
54 |
+
"metadata": {},
|
55 |
+
"outputs": [
|
56 |
+
{
|
57 |
+
"name": "stdout",
|
58 |
+
"output_type": "stream",
|
59 |
+
"text": [
|
60 |
+
"post_generator1\n"
|
61 |
+
]
|
62 |
+
}
|
63 |
+
],
|
64 |
+
"source": [
|
65 |
+
"from google.cloud import storage\n",
|
66 |
+
"\n",
|
67 |
+
"client = storage.Client()\n",
|
68 |
+
"for bucket in client.list_buckets():\n",
|
69 |
+
" print(bucket.name)\n"
|
70 |
+
]
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"cell_type": "code",
|
74 |
+
"execution_count": null,
|
75 |
+
"id": "526ff54a",
|
76 |
+
"metadata": {},
|
77 |
+
"outputs": [],
|
78 |
+
"source": []
|
79 |
+
}
|
80 |
+
],
|
81 |
+
"metadata": {
|
82 |
+
"kernelspec": {
|
83 |
+
"display_name": "llmpdf-env",
|
84 |
+
"language": "python",
|
85 |
+
"name": "python3"
|
86 |
+
},
|
87 |
+
"language_info": {
|
88 |
+
"codemirror_mode": {
|
89 |
+
"name": "ipython",
|
90 |
+
"version": 3
|
91 |
+
},
|
92 |
+
"file_extension": ".py",
|
93 |
+
"mimetype": "text/x-python",
|
94 |
+
"name": "python",
|
95 |
+
"nbconvert_exporter": "python",
|
96 |
+
"pygments_lexer": "ipython3",
|
97 |
+
"version": "3.13.1"
|
98 |
+
}
|
99 |
+
},
|
100 |
+
"nbformat": 4,
|
101 |
+
"nbformat_minor": 5
|
102 |
+
}
|
llm_integration.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from openai import OpenAI
|
5 |
+
from presidio_analyzer import AnalyzerEngine
|
6 |
+
|
7 |
+
load_dotenv()
|
8 |
+
|
9 |
+
INSTRUCTION_TEMPLATE = """
|
10 |
+
Generate a compelling LinkedIn post in a {tone} tone based on the PDF content provided, following these guidelines:
|
11 |
+
|
12 |
+
1. STYLE & TONE:
|
13 |
+
- Write in first-person perspective as someone who has personally read and been impacted by the document
|
14 |
+
- Use a conversational, thoughtful tone that reflects genuine interest in the topic
|
15 |
+
- Include 1-2 personal reflections or opinions that demonstrate engagement with the material
|
16 |
+
- Vary sentence structure and length to create natural rhythm and flow
|
17 |
+
|
18 |
+
2. STRUCTURE (1300-2000 characters):
|
19 |
+
- Start with an attention-grabbing opening that poses a question or shares a surprising insight
|
20 |
+
- Break content into 2-3 short paragraphs with strategic spacing for readability
|
21 |
+
- Include 1-2 specific facts or statistics from the document to establish credibility
|
22 |
+
- End with a thought-provoking question or call-to-action that encourages comments
|
23 |
+
|
24 |
+
3. CONTENT ELEMENTS:
|
25 |
+
- Mention authors and publication date naturally within the flow of text
|
26 |
+
- Reference that you've been reading/reviewing this document (without explicitly saying "PDF")
|
27 |
+
- Focus on 1-3 key takeaways rather than attempting to summarize everything
|
28 |
+
- Include your perspective on why these insights matter to your professional network
|
29 |
+
|
30 |
+
4. ATTRIBUTION & FORMATTING:
|
31 |
+
- Use 1-3 emojis maximum, placed strategically (not in succession)
|
32 |
+
- At the end of the post, include a clear attribution line with authors and publication date
|
33 |
+
- Follow the attribution with these hashtag related to the content; always include #llm
|
34 |
+
- Format example: "Based on work by [Authors] ([Publication Date]) #llm #sports #innovation"
|
35 |
+
- DO NOT include character counts, introductory phrases, or any meta-commentary
|
36 |
+
- DO NOT present as a formal summary or book report - write as a professional sharing valuable insights
|
37 |
+
|
38 |
+
The final post should read as if a thoughtful professional read something interesting and wanted to share their genuine takeaways with their network, while properly crediting the original authors.
|
39 |
+
"""
|
40 |
+
|
41 |
+
# Initialize the Presidio PII Analyzer
|
42 |
+
analyzer = AnalyzerEngine()
|
43 |
+
|
44 |
+
# Define which PII entities to check for
|
45 |
+
PII_ENTITIES_TO_CHECK = [
|
46 |
+
"EMAIL_ADDRESS",
|
47 |
+
"PHONE_NUMBER",
|
48 |
+
"CREDIT_CARD",
|
49 |
+
"US_SSN"
|
50 |
+
]
|
51 |
+
|
52 |
+
MIN_CONFIDENCE = 0.8 # Minimum confidence threshold for detected entities
|
53 |
+
|
54 |
+
def contains_pii(text: str) -> bool:
    """Check ``text`` for the configured PII entity types.

    Runs the module-level Presidio analyzer restricted to
    PII_ENTITIES_TO_CHECK and keeps only results whose score is at least
    MIN_CONFIDENCE.

    Args:
        text: Text to scan (analysed as English).

    Returns:
        True if at least one high-confidence entity was found, else False.
    """
    results = analyzer.analyze(text=text, entities=PII_ENTITIES_TO_CHECK, language='en')
    flagged = [r for r in results if r.score >= MIN_CONFIDENCE]
    if not flagged:
        return False

    # Debug output: report only the entity type and score. The matched text
    # itself is deliberately NOT printed, so the sensitive value is not
    # copied into stdout / application logs.
    print("Detected PII:", [(r.entity_type, r.score) for r in flagged])
    return True
|
66 |
+
|
67 |
+
def generate_linkedin_post(pdf_content: str, tone: str = "Professional", retry_num: int = 0) -> str:
    """Generate a social media post from extracted PDF text via OpenRouter.

    Args:
        pdf_content: Text extracted from the uploaded PDF.
        tone: Tone name substituted into the instruction template.
        retry_num: Zero-based retry counter; each retry raises the sampling
            temperature by 0.1 to add variability.

    Returns:
        The generated post, or a human-readable message when the content
        contains PII ("⚠️ ...") or the request fails
        ("Error generating Social Media post: ...").

    Raises:
        ValueError: If OPENROUTER_API_KEY is not set.
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY environment variable is not set")

    # PII detection before sending content to LLM
    if contains_pii(pdf_content):
        return (
            "⚠️ The uploaded PDF appears to contain personal or sensitive information. "
            "Please remove such details before generating a post."
        )

    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        instruction = INSTRUCTION_TEMPLATE.format(tone=tone)
        # Add variability on retries, but never exceed 2.0, the upper bound
        # of the temperature range accepted by OpenAI-compatible chat APIs.
        temperature = min(0.7 + 0.1 * retry_num, 2.0)

        response = client.chat.completions.create(
            model="meta-llama/llama-3.3-8b-instruct:free",
            messages=[
                {"role": "system", "content": instruction},
                {"role": "user", "content": f"PDF Content:\n{pdf_content}"},
            ],
            temperature=temperature,
            max_tokens=2000,
            top_p=0.85,
            stream=False,
        )

        choices = getattr(response, "choices", None)
        # The message content may be None or blank; report that explicitly
        # instead of failing on None.strip() or returning an empty post.
        text = choices[0].message.content if choices else None
        if not text or not text.strip():
            raise RuntimeError("No content returned by the language model.")
        return text.strip()

    except Exception as e:
        # Deliberate best-effort: callers display the returned string as-is.
        return f"Error generating Social Media post: {str(e)}"
|
main.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# main.py
|
2 |
+
import os
|
3 |
+
import gradio as gr
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from pdf_processing import extract_pdf_content
|
6 |
+
from llm_integration import generate_linkedin_post
|
7 |
+
from analytics import log_analytics, is_too_similar
|
8 |
+
|
9 |
+
load_dotenv()
|
10 |
+
|
11 |
+
# def process_pdf(file, tone, version):
|
12 |
+
# if file is None:
|
13 |
+
# return "Please upload a PDF file.", "Character count: 0", gr.update(visible=False), gr.update(visible=False)
|
14 |
+
|
15 |
+
# content = extract_pdf_content(file)
|
16 |
+
# if content.startswith("Error"):
|
17 |
+
# return content, "Character count: 0", gr.update(visible=False), gr.update(visible=False)
|
18 |
+
|
19 |
+
# max_attempts = 3
|
20 |
+
# for attempt in range(max_attempts):
|
21 |
+
# post = generate_linkedin_post(content, tone, retry_num=attempt)
|
22 |
+
# if not is_too_similar(post):
|
23 |
+
# log_analytics("generation", {"tone": tone, "version": version, "length": len(post)}, content=post)
|
24 |
+
# return post, f"Character count: {len(post)}", gr.update(visible=True), gr.update(visible=False)
|
25 |
+
|
26 |
+
# return "⚠️ Could not generate a unique post after 3 tries. Try changing the tone or the document.", "Character count: 0", gr.update(visible=False), gr.update(visible=False)
|
27 |
+
|
28 |
+
def process_pdf(file, tone, version):
    """Gradio handler: turn an uploaded PDF into a social media post.

    Args:
        file: Value of the PDF upload component (None when nothing is uploaded).
        tone: Selected tone, forwarded to the LLM prompt.
        version: Selected version label; only recorded in analytics metadata.

    Returns:
        A 4-tuple for (output_box, char_count, feedback_row, feedback_status):
        the post or a message, the character-count text, and two visibility
        updates.
    """
    if file is None:
        return "Please upload a PDF file.", "Character count: 0", gr.update(visible=False), gr.update(visible=False)

    content = extract_pdf_content(file)
    # extract_pdf_content reports failures as plain strings, and not all of
    # them start with "Error"; none of them may be sent to the LLM as if they
    # were document text.
    extraction_failures = (
        "No PDF file was uploaded. Please upload a PDF file.",
        "No text content could be extracted from the PDF. The file might be scanned or contain only images.",
    )
    if content.startswith("Error") or content in extraction_failures:
        return content, "Character count: 0", gr.update(visible=False), gr.update(visible=False)

    max_attempts = 5
    similarity_threshold = 0.7
    # generate_linkedin_post returns these messages instead of a post when the
    # request fails or the PDF contains PII.
    non_post_prefixes = (
        "Error generating Social Media post:",
        "⚠️ The uploaded PDF appears to contain",
    )

    for attempt in range(max_attempts):
        post = generate_linkedin_post(content, tone, retry_num=attempt)

        # Show the message, but do not log it as a "generation" event (it
        # would pollute the similarity history) and do not offer feedback.
        if post.startswith(non_post_prefixes):
            return post, "Character count: 0", gr.update(visible=False), gr.update(visible=False)

        # Allow first post regardless of similarity
        # NOTE(review): because attempt 0 is always accepted, the loop returns
        # on its first iteration and the similarity check below never runs;
        # confirm this is intended.
        if attempt == 0 or not is_too_similar(post, threshold=similarity_threshold):
            log_analytics("generation", {"tone": tone, "version": version, "length": len(post)}, content=post)
            return post, f"Character count: {len(post)}", gr.update(visible=True), gr.update(visible=False)

    return "⚠️ Could not generate a unique post after multiple tries. Try changing the tone or the document.", "Character count: 0", gr.update(visible=False), gr.update(visible=False)
|
47 |
+
|
48 |
+
def submit_feedback(post, sentiment, has_feedback):
    """Gradio handler: record a thumbs-up / thumbs-down rating for a post.

    Args:
        post: Current text of the output box.
        sentiment: "positive" or "negative", supplied by a hidden textbox.
        has_feedback: Session flag, True once feedback was already given.

    Returns:
        A 3-tuple for (feedback_status, feedback_row, has_given_feedback).
    """
    if has_feedback:
        return gr.update(visible=True, value="You've already provided feedback. Thank you!"), gr.update(visible=False), True

    # Placeholder, error and "⚠️" warning messages shown in the output box
    # are not generated posts and must not be logged as rated content.
    not_a_post = ("Please upload", "Error", "⚠️")
    if not post or post.startswith(not_a_post):
        return gr.update(visible=True, value="⚠️ No valid post to rate. Generate a post first!"), gr.update(visible=True), False

    log_analytics("feedback", {"sentiment": sentiment}, content=post)
    message = "Thank you for your feedback! 😊" if sentiment == "positive" else "Thank you for your feedback! We'll work to improve. 🙏"
    return gr.update(visible=True, value=message), gr.update(visible=False), True
|
58 |
+
|
59 |
+
# Gradio UI: inputs on the left, generated post on the right, and a feedback
# row that is only revealed after a successful generation.
with gr.Blocks(title="PDF to Social Media Post Generator", css=".blue-button {background-color: #0A66C2; color: white;}") as app:
    # Per-session flag so each generated post can be rated only once.
    has_given_feedback = gr.State(False)

    gr.Markdown("# 📄 PDF to Social Media Post Generator")
    gr.Markdown("Upload a PDF document, choose tone and version, and generate a Social Media post.")
    # Keep this notice in line with what the code does: the PII scan covers
    # emails, phone numbers, credit cards and US SSNs (not names), and
    # generated posts / feedback are written to the analytics log.
    gr.Markdown(
        "⚠️ **Important:** Uploaded PDFs will be scanned for sensitive data (emails, phone numbers, "
        "credit card numbers, US Social Security numbers) before being sent to the LLM model. "
        "Generated posts and feedback ratings are logged for analytics."
    )

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            tone_dropdown = gr.Dropdown(
                label="Select Tone",
                choices=["Professional", "Mario Bros Style", "Insightful", "Promotional"],
                value="Professional"
            )
            version_dropdown = gr.Dropdown(
                label="Select Version",
                choices=[
                    "v1-Standard structure and tone",
                    "v2-Experimental with richer sentence variety and longer posts"
                ],
                value="v1-Standard structure and tone"
            )
            generate_button = gr.Button("Generate Social Media Post", elem_classes="blue-button")

        with gr.Column():
            output_box = gr.Textbox(label="Generated Social Media Post", lines=15, show_copy_button=True)
            char_count = gr.Markdown("Character count: 0")

    with gr.Row(visible=False) as feedback_row:
        gr.Markdown("### Was this post helpful?")
        positive_btn = gr.Button("👍 Yes", variant="primary", size="sm")
        negative_btn = gr.Button("👎 No", variant="secondary", size="sm")

    feedback_status = gr.Markdown(visible=False)

    # Hidden signals for feedback logic
    positive_signal = gr.Textbox(value="positive", visible=False)
    negative_signal = gr.Textbox(value="negative", visible=False)

    generate_button.click(
        fn=process_pdf,
        inputs=[pdf_input, tone_dropdown, version_dropdown],
        outputs=[output_box, char_count, feedback_row, feedback_status]
    )

    # A new generation resets the "already rated" flag.
    generate_button.click(fn=lambda: False, outputs=has_given_feedback)

    positive_btn.click(
        fn=submit_feedback,
        inputs=[output_box, positive_signal, has_given_feedback],
        outputs=[feedback_status, feedback_row, has_given_feedback]
    )

    negative_btn.click(
        fn=submit_feedback,
        inputs=[output_box, negative_signal, has_given_feedback],
        outputs=[feedback_status, feedback_row, has_given_feedback]
    )

if __name__ == "__main__":
    app.launch(share=True)
|
pdf_processing.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PyPDF2
|
2 |
+
import os
|
3 |
+
|
4 |
+
def extract_pdf_content(pdf_file):
    """
    Extracts text content from a PDF file.

    Args:
        pdf_file: The uploaded PDF file (anything PyPDF2.PdfReader accepts),
            or None when nothing was uploaded.

    Returns:
        str: Extracted text content from the PDF, or a human-readable message
        when no file was given, no text could be extracted, or reading failed
        (the latter starts with "Error extracting text from PDF:").
    """
    if pdf_file is None:
        return "No PDF file was uploaded. Please upload a PDF file."

    try:
        # Gradio file component returns the file path
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Guard against a page yielding None instead of a string, so a single
        # text-less page cannot fail the whole document; join avoids repeated
        # string concatenation.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

        if not text.strip():
            return "No text content could be extracted from the PDF. The file might be scanned or contain only images."

        return text
    except Exception as e:
        # Best-effort by design: callers display the returned message.
        return f"Error extracting text from PDF: {str(e)}"
|