Dooratre committed
Commit 43fd6f4 · verified · 1 Parent(s): 6d047e5

Create app.py

Files changed (1): app.py +270 -0
app.py ADDED
@@ -0,0 +1,270 @@
import re
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from flask import Flask, jsonify, Response
from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime

# Import GitHub JSON DB helpers
from db_twiter import fetch_authenticity_token_and_commit_oid, update_user_json_file, fetch_json_from_github

BROWSE_ENDPOINT = "https://corvo-ai-xx-pg.hf.space/browse"
TWIIIT_REDIRECT = "https://twiiit.com/get-location"
DEFAULT_NITTER_DOMAIN = "nitter.net"


def get_current_nitter_domain(timeout=10):
    # twiiit.com replies with a redirect to a currently working Nitter
    # instance; fall back to the default domain if that lookup fails.
    try:
        resp = requests.get(TWIIIT_REDIRECT, allow_redirects=False, timeout=timeout)
        location = resp.headers.get("Location")
        if location:
            domain = urlparse(location).netloc
            if domain:
                return domain
    except requests.RequestException:
        pass
    return DEFAULT_NITTER_DOMAIN


def build_nitter_profile_url(domain: str, username: str) -> str:
    username = username.lstrip("@").strip()
    return f"https://{domain}/{username}"


def browse(urls, wait_for=350):
    # Render the given URLs through the remote browsing service.
    payload = {"urls": urls, "wait_for": wait_for}
    headers = {"Content-Type": "application/json"}
    resp = requests.post(BROWSE_ENDPOINT, json=payload, headers=headers, timeout=120)
    resp.raise_for_status()
    try:
        return resp.json()
    except ValueError:
        return {"raw": resp.text}


def strip_markdown_links(text: str) -> str:
    # Drop images entirely; keep only the visible text of ordinary links.
    text = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    return text


def html_to_text(s: str) -> str:
    soup = BeautifulSoup(s, "html.parser")
    for br in soup.find_all("br"):
        br.replace_with("\n")
    return soup.get_text(separator="\n")


def clean_text(s: str) -> str:
    s = strip_markdown_links(s)
    s = html_to_text(s)
    s = s.replace("\\\\", "\\")
    s = re.sub(r'[ \t]+', ' ', s)
    s = re.sub(r'\n{3,}', '\n\n', s)
    return s.strip()

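# The browsing service is assumed to return JSON shaped roughly like
#   {"output": {"results": [{"content": "<page text>"}, ...]}}
# which is what extract_text_from_response() below unpacks.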
def extract_text_from_response(data: dict) -> str:
    parts = []
    try:
        results = data["output"]["results"]
    except (KeyError, TypeError):
        return clean_text(str(data))
    for r in results:
        content = r.get("content", "")
        if not content:
            continue
        parts.append(clean_text(content))
    return "\n\n".join(p for p in parts if p)


def parse_posts(raw_text: str):
    lines = [ln.strip() for ln in raw_text.splitlines()]
    name_handle_re = re.compile(r'^(.+?)\s+@([A-Za-z0-9_]+)\s*$')
    link_block_re = re.compile(r'^\[\]\(https?://')

    def is_recent_time(s: str) -> bool:
        # Only keep posts stamped in seconds/minutes/hours (e.g. "3h", "45m").
        return bool(re.fullmatch(r'\d+\s*[hmsHMS]', s))

    posts = []
    i = 0
    n = len(lines)

    while i < n:
        m = name_handle_re.match(lines[i])
        if not m:
            i += 1
            continue

        name = m.group(1).strip()
        handle = '@' + m.group(2).strip()

        i += 1
        while i < n and lines[i] == '':
            i += 1
        if i >= n:
            break

        publish_time = lines[i]
        if not is_recent_time(publish_time):
            i += 1
            continue

        i += 1

        body_lines = []
        while i < n:
            line = lines[i]
            if name_handle_re.match(line):
                break
            if link_block_re.match(line):
                break
            body_lines.append(line)
            i += 1

        def strip_trailing_counters(lines_in):
            # Trailing lines of bare numbers are engagement counters
            # (replies, reposts, likes, views); drop them when two or
            # more appear in a row.
            j = len(lines_in) - 1
            count = 0
            while j >= 0 and re.fullmatch(r'\d{1,4}(,\d{3})*', lines_in[j]):
                count += 1
                j -= 1
                if count >= 5:
                    break
            if count >= 2:
                return lines_in[:j+1]
            return lines_in

        body_lines = strip_trailing_counters([ln for ln in body_lines if ln != ''])
        body = "\n".join(body_lines).strip()

        if body:
            posts.append({
                "name": name,
                "handle": handle,
                "time": publish_time,
                "post": body
            })

        while i < n and link_block_re.match(lines[i]):
            i += 1

    return posts

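# For reference, an assumed sketch (not captured output) of the flattened
# Nitter text that parse_posts() above walks through:
#
#   Some Name @somehandle
#   3h
#   First line of the post body
#   Second line of the post body
#   12
#   1,204                            <- bare numbers: engagement counters (stripped)
#   [](https://nitter.example/...)   <- link block between posts (skipped)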
def format_posts(posts):
    sep = "-" * 50
    out = []
    for idx, p in enumerate(posts):
        out.append(f"{p['name']} {p['handle']}")
        out.append(p['time'])
        out.append(p['post'])
        if idx != len(posts) - 1:
            out.append(sep)
    return "\n".join(out).strip()


def fetch_posts_for_user(domain: str, username: str, wait_for=350):
    url = build_nitter_profile_url(domain, username)
    resp = browse([url], wait_for=wait_for)
    text = extract_text_from_response(resp)
    posts = parse_posts(text)
    return posts


def build_output_text():
    # 15+ accounts
    usernames = [
        "zerohedge",
        "lisaabramowicz1",
        "elerianm",
        "jsblokland",
        "AndreasSteno",
        "charliebilello",
        "GameofTrades_",
        "SantiagoAuFund",
        "DylanLeClair_",
        "Ole_S_Hansen",
        "NickTimiraos",
        "federalreserve",
        "POTUS",
        "WhiteHouse",
        "USTreasury",
        "Reuters",
        "BloombergTV",
    ]

    domain = get_current_nitter_domain()

    outputs = []
    for uname in usernames:
        uname_clean = uname.lstrip("@")
        try:
            posts = fetch_posts_for_user(domain, uname_clean, wait_for=350)
            header = f"=== @{uname_clean} ==="
            if posts:
                outputs.append(header)
                outputs.append(format_posts(posts))
            else:
                outputs.append(f"{header}\nNo recent posts found.")
        except Exception as e:
            outputs.append(f"=== @{uname_clean} ===\nError: {e}")

    # Append timestamp footer to help trace runs
    outputs.append("")
    outputs.append(f"Last update: {datetime.utcnow().isoformat()}Z")

    return "\n\n".join(outputs)


def save_to_github_twiter_json(text_output: str):
    # We store as JSON string, e.g. {"twiter": "<text here>"}
    payload_obj = {"twiter": text_output}
    new_content = json.dumps(payload_obj, ensure_ascii=False)

    token, commit_oid = fetch_authenticity_token_and_commit_oid()
    if not token or not commit_oid:
        return {"success": False, "message": "Failed to retrieve authenticity token or commit OID."}

    res = update_user_json_file(token, commit_oid, new_content)
    return res


# Job that runs to fetch and save
def run_job():
    try:
        text = build_output_text()
        result = save_to_github_twiter_json(text)
        return result
    except Exception as e:
        return {"success": False, "message": f"Job failed: {e}"}


# Flask app
app = Flask(__name__)


@app.route("/", methods=["GET"])
def index():
    # Return the latest content from GitHub
    data = fetch_json_from_github()
    if data.get("success") and isinstance(data.get("data"), dict):
        tw_text = data["data"].get("twiter", "")
        return Response(tw_text, mimetype="text/plain; charset=utf-8")
    return jsonify(data), 500


@app.route("/run", methods=["POST"])
def run_now():
    res = run_job()
    status = 200 if res.get("success") else 500
    return jsonify(res), status


def schedule_jobs():
    scheduler = BackgroundScheduler(timezone="UTC")

    # Run at minute 0 and 30 of every hour (UTC)
    scheduler.add_job(run_job, "cron", minute="0,30", id="twiter_fetch_save")

    scheduler.start()
    return scheduler


if __name__ == "__main__":
    # Start scheduler
    schedule_jobs()

    # Optional: run once on startup
    try:
        run_job()
    except Exception:
        pass

    # Run Flask on port 7860
    app.run(host="0.0.0.0", port=7860)
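
For a quick smoke test of the two routes, something like the following should work — a minimal sketch assuming the app is reachable on localhost:7860 and db_twiter is configured with working credentials (BASE is a hypothetical local address):

import requests

BASE = "http://localhost:7860"  # hypothetical local address; adjust for the deployed Space

# Trigger one fetch-and-save cycle immediately, without waiting for the scheduler.
res = requests.post(f"{BASE}/run", timeout=600)
print(res.status_code, res.json())

# Read back the latest saved text via the index route.
print(requests.get(f"{BASE}/", timeout=30).text[:500])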