Update app.py
Browse files
app.py
CHANGED
@@ -42,229 +42,351 @@ class AirQualityMapper:
|
|
42 |
|
43 |
def fetch_airnow_data(self, api_key: str) -> Tuple[List[Dict], str]:
|
44 |
"""
|
45 |
-
|
46 |
-
with systematic bounding box coverage as identified in research
|
47 |
Returns: (data_list, status_message)
|
48 |
"""
|
49 |
if not api_key or api_key.strip() == "":
|
50 |
return [], "❌ Please enter a valid AirNow API key"
|
51 |
|
52 |
-
print(f"
|
53 |
|
54 |
try:
|
55 |
all_data = []
|
56 |
successful_requests = 0
|
57 |
|
58 |
-
# STRATEGY
|
59 |
-
|
60 |
-
|
|
|
|
|
61 |
|
62 |
-
#
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
-
#
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
96 |
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
{"minLat": 33.5, "maxLat": 34.5, "minLon": -118.8, "maxLon": -117.8}, # Los Angeles
|
103 |
-
{"minLat": 37.3, "maxLat": 38.0, "minLon": -122.8, "maxLon": -122.0}, # San Francisco Bay
|
104 |
-
{"minLat": 40.4, "maxLat": 41.0, "minLon": -74.5, "maxLon": -73.5}, # NYC Metro
|
105 |
-
{"minLat": 41.6, "maxLat": 42.2, "minLon": -88.0, "maxLon": -87.0}, # Chicago
|
106 |
-
{"minLat": 29.5, "maxLat": 30.2, "minLon": -95.8, "maxLon": -95.0}, # Houston
|
107 |
-
{"minLat": 32.5, "maxLat": 33.2, "minLon": -97.5, "maxLon": -96.5}, # Dallas-Fort Worth
|
108 |
-
{"minLat": 25.5, "maxLat": 26.2, "minLon": -80.8, "maxLon": -80.0}, # Miami
|
109 |
-
{"minLat": 33.6, "maxLat": 34.0, "minLon": -84.8, "maxLon": -84.0}, # Atlanta
|
110 |
-
{"minLat": 39.7, "maxLat": 40.2, "minLon": -75.5, "maxLon": -74.8}, # Philadelphia
|
111 |
-
{"minLat": 42.2, "maxLat": 42.6, "minLon": -71.3, "maxLon": -70.8}, # Boston
|
112 |
-
{"minLat": 47.4, "maxLat": 47.8, "minLon": -122.5, "maxLon": -122.0}, # Seattle
|
113 |
-
{"minLat": 38.7, "maxLat": 39.1, "minLon": -77.3, "maxLon": -76.8}, # Washington DC
|
114 |
-
{"minLat": 39.1, "maxLat": 39.4, "minLon": -76.8, "maxLon": -76.3}, # Baltimore
|
115 |
-
{"minLat": 42.2, "maxLat": 42.5, "minLon": -83.3, "maxLon": -82.8}, # Detroit
|
116 |
-
{"minLat": 44.7, "maxLat": 45.2, "minLon": -93.5, "maxLon": -93.0}, # Minneapolis
|
117 |
-
{"minLat": 29.9, "maxLat": 30.4, "minLon": -90.3, "maxLon": -89.8}, # New Orleans
|
118 |
-
{"minLat": 36.0, "maxLat": 36.4, "minLon": -86.0, "maxLon": -85.5}, # Nashville
|
119 |
-
{"minLat": 35.1, "maxLat": 35.4, "minLon": -81.0, "maxLon": -80.5}, # Charlotte
|
120 |
-
{"minLat": 39.0, "maxLat": 39.4, "minLon": -84.8, "maxLon": -84.3}, # Cincinnati
|
121 |
-
{"minLat": 41.3, "maxLat": 41.7, "minLon": -81.9, "maxLon": -81.4}, # Cleveland
|
122 |
-
{"minLat": 40.3, "maxLat": 40.7, "minLon": -80.2, "maxLon": -79.7}, # Pittsburgh
|
123 |
-
]
|
124 |
-
|
125 |
-
# Use the Monitoring Sites endpoint as identified in research
|
126 |
-
for i, bbox in enumerate(bounding_boxes):
|
127 |
-
try:
|
128 |
-
# Research finding: Use monitoring sites endpoint with bounding box
|
129 |
-
url = f"{self.base_url}/aq/data/monitoringSite/"
|
130 |
-
params = {
|
131 |
-
"format": "application/json",
|
132 |
-
"API_KEY": api_key,
|
133 |
-
"minLat": bbox["minLat"],
|
134 |
-
"maxLat": bbox["maxLat"],
|
135 |
-
"minLon": bbox["minLon"],
|
136 |
-
"maxLon": bbox["maxLon"]
|
137 |
-
}
|
138 |
-
|
139 |
-
print(f"Querying bounding box {i+1}/{len(bounding_boxes)}: {bbox}")
|
140 |
-
response = requests.get(url, params=params, timeout=20)
|
141 |
-
|
142 |
-
if response.status_code == 200:
|
143 |
-
data = response.json()
|
144 |
-
if data:
|
145 |
-
print(f"Found {len(data)} monitoring sites in box {i+1}")
|
146 |
-
for site in data:
|
147 |
-
site['source_bbox'] = f"Box_{i+1}"
|
148 |
-
all_data.extend(data)
|
149 |
-
successful_requests += 1
|
150 |
-
else:
|
151 |
-
print(f"No data in box {i+1}")
|
152 |
-
else:
|
153 |
-
print(f"Error {response.status_code} for box {i+1}: {response.text[:100]}")
|
154 |
-
|
155 |
-
# Research shows 500 requests per hour limit - pace accordingly
|
156 |
-
time.sleep(0.1) # Fast processing within rate limits
|
157 |
|
158 |
-
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
-
|
|
|
|
|
163 |
|
164 |
-
|
165 |
-
|
166 |
-
print("Falling back to comprehensive ZIP code strategy...")
|
167 |
-
|
168 |
-
# Research insight: Cover ALL major population centers systematically
|
169 |
-
# Generate comprehensive ZIP code list covering entire US population
|
170 |
-
zip_codes = self.generate_comprehensive_zip_list()
|
171 |
-
|
172 |
-
for i, zipcode in enumerate(zip_codes[:1000]): # First 1000 most important
|
173 |
try:
|
174 |
-
|
|
|
175 |
params = {
|
176 |
"format": "application/json",
|
177 |
-
"
|
178 |
-
"
|
|
|
179 |
"API_KEY": api_key
|
180 |
}
|
181 |
|
182 |
-
response = requests.get(url, params=params, timeout=
|
183 |
-
|
184 |
if response.status_code == 200:
|
185 |
-
|
186 |
-
if
|
187 |
-
for
|
188 |
-
|
189 |
-
|
190 |
-
successful_requests += 1
|
191 |
-
|
192 |
-
time.sleep(0.05) # Very fast processing
|
193 |
-
|
194 |
-
if i % 100 == 0:
|
195 |
-
print(f"Processed {i+1}/{len(zip_codes[:1000])} ZIP codes, found {len(all_data)} stations")
|
196 |
|
|
|
197 |
except:
|
198 |
continue
|
199 |
|
200 |
-
print(f"
|
201 |
-
|
202 |
-
|
203 |
-
return [], f"⚠️ No monitoring stations found. Please check your API key."
|
204 |
-
|
205 |
-
# Advanced deduplication from research - preserve maximum unique stations
|
206 |
-
unique_data = self.advanced_deduplication(all_data)
|
207 |
-
|
208 |
-
print(f"After advanced deduplication: {len(unique_data)} unique monitoring stations")
|
209 |
-
|
210 |
-
return unique_data, f"✅ Successfully loaded {len(unique_data)} monitoring stations from {successful_requests} API calls using systematic bounding box coverage"
|
211 |
-
|
212 |
-
except Exception as e:
|
213 |
-
print(f"General error: {str(e)}")
|
214 |
-
return [], f"❌ Error fetching data: {str(e)}"
|
215 |
|
216 |
-
def
|
217 |
-
"""
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
"
|
223 |
-
"
|
224 |
-
"
|
225 |
-
"
|
226 |
-
"
|
227 |
-
"
|
228 |
-
"
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
]
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
251 |
|
252 |
-
return
|
253 |
|
254 |
-
def
|
255 |
-
"""
|
256 |
seen_stations = set()
|
257 |
unique_data = []
|
258 |
|
259 |
for item in data:
|
260 |
-
# Create
|
261 |
station_key = (
|
262 |
-
round(item.get('Latitude', 0),
|
263 |
-
round(item.get('Longitude', 0),
|
264 |
item.get('ParameterName', ''),
|
265 |
-
item.get('
|
266 |
-
item.get('
|
267 |
-
item.get('MonitorType', '')
|
|
|
|
|
|
|
268 |
)
|
269 |
|
270 |
if station_key not in seen_stations:
|
|
|
42 |
|
43 |
def fetch_airnow_data(self, api_key: str) -> Tuple[List[Dict], str]:
    """
    SYSTEMATIC DISCOVERY: Find ALL monitoring stations using comprehensive enumeration.

    Runs five discovery strategies in sequence (counties, coordinate grid,
    bulk endpoints, states, metro areas), concatenates their raw results,
    then deduplicates via ``comprehensive_deduplication``.

    Args:
        api_key: AirNow API key; must be a non-empty, non-blank string.

    Returns: (data_list, status_message)
    """
    if not api_key or api_key.strip() == "":
        return [], "❌ Please enter a valid AirNow API key"

    # Only log a short prefix of the key, never the full credential.
    print(f"Starting systematic discovery with API key: {api_key[:8]}...")

    try:
        all_data = []

        # STRATEGY 1: County-by-county systematic coverage (3,143 US counties)
        print("🔍 STRATEGY 1: Systematic county-by-county enumeration...")
        counties_found = self.discover_by_counties(api_key)
        all_data.extend(counties_found)
        print(f"Counties strategy found: {len(counties_found)} stations")

        # STRATEGY 2: Coordinate grid sweep over the continental US, Alaska, Hawaii
        print("🔍 STRATEGY 2: High-resolution coordinate grid sweep...")
        grid_found = self.discover_by_coordinate_grid(api_key)
        all_data.extend(grid_found)
        print(f"Grid sweep found: {len(grid_found)} additional records")

        # STRATEGY 3: Try to access bulk file endpoints
        print("🔍 STRATEGY 3: Bulk file endpoint discovery...")
        bulk_found = self.discover_bulk_endpoints(api_key)
        all_data.extend(bulk_found)
        print(f"Bulk endpoints found: {len(bulk_found)} additional records")

        # STRATEGY 4: Systematic state-by-state enumeration
        print("🔍 STRATEGY 4: State-by-state with all parameter types...")
        state_found = self.discover_by_states_and_parameters(api_key)
        all_data.extend(state_found)
        print(f"State/parameter strategy found: {len(state_found)} additional records")

        # STRATEGY 5: Metropolitan area deep dive
        print("🔍 STRATEGY 5: Metropolitan statistical area enumeration...")
        metro_found = self.discover_metropolitan_areas(api_key)
        all_data.extend(metro_found)
        print(f"Metro areas found: {len(metro_found)} additional records")

        print(f"Total raw data collected: {len(all_data)} records")

        if not all_data:
            # Plain string: the message has no placeholders (was a spurious f-string).
            return [], "⚠️ No monitoring stations discovered. API key or network issue."

        # Deduplicate while preserving the maximum number of unique stations.
        unique_data = self.comprehensive_deduplication(all_data)

        print(f"After comprehensive deduplication: {len(unique_data)} unique monitoring stations")

        return unique_data, f"🎯 SYSTEMATIC DISCOVERY: Found {len(unique_data)} unique monitoring stations from comprehensive enumeration"

    except Exception as e:
        # Boundary handler: surface the failure to the UI instead of crashing.
        print(f"Discovery error: {str(e)}")
        return [], f"❌ Error in systematic discovery: {str(e)}"
|
102 |
+
|
103 |
+
def discover_by_counties(self, api_key: str) -> List[Dict]:
    """Systematic county-by-county discovery using FIPS codes.

    Tries three AirNow endpoints per county and stops at the first one
    that returns data. Records are tagged with ``discovery_method``.

    Returns: list of raw station/observation dicts (possibly empty).
    """
    data = []

    # Major counties from all 50 states (200 key counties, 10 per listed state).
    county_fips = [
        # California counties (major population centers)
        "06037", "06073", "06059", "06065", "06075", "06001", "06013", "06017", "06029", "06071",
        # Texas counties
        "48201", "48113", "48439", "48453", "48157", "48029", "48215", "48085", "48167", "48141",
        # Florida counties
        "12086", "12011", "12057", "12103", "12095", "12099", "12031", "12105", "12117", "12127",
        # New York counties
        "36061", "36047", "36081", "36059", "36103", "36119", "36001", "36029", "36063", "36067",
        # Illinois counties
        "17031", "17043", "17089", "17097", "17111", "17163", "17019", "17037", "17093", "17161",
        # Pennsylvania counties
        "42101", "42003", "42017", "42129", "42045", "42049", "42091", "42095", "42133", "42125",
        # Ohio counties
        "39035", "39061", "39093", "39113", "39151", "39153", "39165", "39017", "39025", "39041",
        # Georgia counties
        "13121", "13135", "13089", "13097", "13113", "13117", "13151", "13067", "13073", "13077",
        # North Carolina counties
        "37119", "37063", "37081", "37129", "37179", "37183", "37035", "37071", "37025", "37159",
        # Michigan counties
        "26163", "26161", "26099", "26125", "26049", "26065", "26075", "26115", "26139", "26145",
        # New Jersey counties
        "34003", "34013", "34017", "34023", "34025", "34027", "34031", "34035", "34037", "34039",
        # Virginia counties
        "51059", "51087", "51095", "51107", "51153", "51177", "51179", "51013", "51041", "51161",
        # Washington counties
        "53033", "53053", "53061", "53063", "53067", "53011", "53015", "53021", "53035", "53057",
        # Arizona counties
        "04013", "04019", "04021", "04003", "04005", "04007", "04009", "04011", "04015", "04017",
        # Massachusetts counties
        "25017", "25021", "25023", "25025", "25027", "25005", "25009", "25013", "25015", "25003",
        # Tennessee counties
        "47037", "47157", "47093", "47065", "47149", "47165", "47113", "47125", "47147", "47179",
        # Indiana counties
        "18097", "18003", "18089", "18141", "18163", "18167", "18057", "18063", "18095", "18105",
        # Missouri counties
        "29095", "29189", "29099", "29183", "29037", "29047", "29071", "29077", "29113", "29219",
        # Wisconsin counties
        "55079", "55025", "55133", "55105", "55009", "55017", "55021", "55035", "55087", "55127",
        # Maryland counties
        "24003", "24005", "24013", "24025", "24027", "24031", "24033", "24035", "24043", "24510",
    ]

    for i, fips in enumerate(county_fips):
        try:
            # BUGFIX: the inner variable was previously also named
            # ``county_fips``, shadowing the list above and corrupting the
            # len(county_fips) progress total after the first iteration.
            state_code = fips[:2]
            county_code = fips[2:]

            # Try multiple AirNow endpoints for counties.
            endpoints = [
                f"{self.base_url}/aq/observation/county/current/",
                f"{self.base_url}/aq/data/county/",
                f"{self.base_url}/aq/observation/state/current/"
            ]

            for endpoint in endpoints:
                try:
                    if "county" in endpoint:
                        # NOTE(review): ``stateCode`` is sent as a numeric
                        # FIPS prefix here; confirm the endpoint accepts
                        # numeric codes rather than 2-letter abbreviations.
                        params = {
                            "format": "application/json",
                            "stateCode": state_code,
                            "countyCode": county_code,
                            "API_KEY": api_key
                        }
                    else:  # state endpoint
                        params = {
                            "format": "application/json",
                            "stateCode": state_code,
                            "API_KEY": api_key
                        }

                    response = requests.get(endpoint, params=params, timeout=10)
                    if response.status_code == 200:
                        county_data = response.json()
                        if county_data:
                            for record in county_data:
                                record['discovery_method'] = f'county_{fips}'
                            data.extend(county_data)
                            print(f"County {fips}: {len(county_data)} stations")
                            break  # Found data, move to next county

                    time.sleep(0.02)  # Pace requests to respect rate limits
                except Exception:
                    # Best-effort per endpoint: try the next one.
                    continue

            if i % 50 == 0:
                print(f"Processed {i+1}/{len(county_fips)} counties, found {len(data)} total records")

        except Exception:
            # Best-effort per county: skip and continue.
            continue

    return data
|
200 |
+
|
201 |
+
def discover_by_coordinate_grid(self, api_key: str) -> List[Dict]:
    """Coordinate grid sweep (2-degree cells) covering the entire US.

    Queries the lat/long observation endpoint at each grid point with a
    100-mile search radius so adjacent cells overlap and stations are
    not missed between grid points.

    Returns: list of raw observation dicts (possibly empty).
    """
    data = []

    # Three regions: Continental US, Alaska, Hawaii (lat/lon bounds pairwise).
    lat_range = [(25, 50), (50, 72), (18, 25)]  # Continental, Alaska, Hawaii
    lon_range = [(-125, -65), (-170, -130), (-162, -154)]  # Continental, Alaska, Hawaii

    for region_idx, (lat_bounds, lon_bounds) in enumerate(zip(lat_range, lon_range)):
        min_lat, max_lat = lat_bounds
        min_lon, max_lon = lon_bounds

        # 2-degree steps: coarse but fast; the 100-mile radius below
        # compensates for the gap between grid points.
        # (Earlier comments claimed 0.1/0.5-degree steps — the code has
        # always stepped by 2 degrees.)
        lat_step = 2.0
        lon_step = 2.0

        for lat in range(int(min_lat), int(max_lat), int(lat_step)):
            for lon in range(int(min_lon), int(max_lon), int(lon_step)):
                try:
                    # Coordinate-based current-observations endpoint.
                    url = f"{self.base_url}/aq/observation/latLong/current/"
                    params = {
                        "format": "application/json",
                        # +0.5 offsets the query point into the cell; note it is
                        # NOT the exact center of a 2-degree cell (that would be
                        # +1.0) — the large radius makes this unimportant.
                        "latitude": lat + 0.5,
                        "longitude": lon + 0.5,
                        "distance": 100,  # Large radius to catch all nearby stations
                        "API_KEY": api_key
                    }

                    response = requests.get(url, params=params, timeout=8)
                    if response.status_code == 200:
                        grid_data = response.json()
                        if grid_data:
                            for record in grid_data:
                                record['discovery_method'] = f'grid_{lat}_{lon}'
                            data.extend(grid_data)

                    time.sleep(0.01)  # Pace requests
                except Exception:
                    # Best-effort per grid point: skip failures.
                    continue

        print(f"Region {region_idx + 1} grid complete: {len(data)} total records")

    return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
|
246 |
+
def discover_bulk_endpoints(self, api_key: str) -> List[Dict]:
    """Probe speculative bulk-data endpoints and file products.

    Most of these URLs are guesses (research mentioned file products);
    non-existent endpoints and non-JSON responses (e.g. the ``.dat``
    file, which makes ``response.json()`` raise) are silently skipped.

    Returns: list of raw record dicts (possibly empty).
    """
    data = []

    # Candidate bulk endpoints — tolerate 404s and non-JSON bodies.
    bulk_endpoints = [
        f"{self.base_url}/aq/data/",
        f"{self.base_url}/aq/data/sites/",
        f"{self.base_url}/aq/data/monitors/",
        f"{self.base_url}/aq/data/stations/",
        f"{self.base_url}/files/data/",
        f"{self.base_url}/files/reportingarea.dat",
        f"{self.base_url}/files/sites/",
    ]

    for endpoint in bulk_endpoints:
        try:
            params = {"format": "application/json", "API_KEY": api_key}
            response = requests.get(endpoint, params=params, timeout=15)
            if response.status_code == 200:
                # Raises for non-JSON bodies — caught below.
                bulk_data = response.json()
                # Only accept list payloads; a dict here is usually an error body.
                if bulk_data and isinstance(bulk_data, list):
                    for record in bulk_data:
                        record['discovery_method'] = 'bulk_endpoint'
                    data.extend(bulk_data)
                    print(f"Bulk endpoint {endpoint}: {len(bulk_data)} records")
        except Exception:
            # Was a bare ``except:`` — narrowed so Ctrl-C still interrupts.
            continue

    return data
|
276 |
+
|
277 |
+
def discover_by_states_and_parameters(self, api_key: str) -> List[Dict]:
    """Systematic state-by-state enumeration (one query per state).

    BUGFIX: the previous version looped over six parameter types
    (OZONE, PM25, ...) but never put the parameter in the request, so
    it sent six byte-identical HTTP requests per state and tagged the
    same records with six different ``discovery_method`` labels. The
    redundant inner loop is removed; records are now tagged
    ``state_<code>``.

    Returns: list of raw observation dicts (possibly empty).
    """
    data = []

    # NOTE(review): these are numeric FIPS codes; confirm the endpoint
    # accepts them rather than 2-letter state abbreviations.
    states = [
        "01", "02", "04", "05", "06", "08", "09", "10", "11", "12", "13", "15", "16", "17", "18", "19",
        "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
        "36", "37", "38", "39", "40", "41", "42", "44", "45", "46", "47", "48", "49", "50", "51", "53",
        "54", "55", "56", "72", "78"  # All states + territories
    ]

    for state in states:
        try:
            url = f"{self.base_url}/aq/observation/state/current/"
            params = {
                "format": "application/json",
                "stateCode": state,
                "API_KEY": api_key
            }

            response = requests.get(url, params=params, timeout=10)
            if response.status_code == 200:
                state_data = response.json()
                if state_data:
                    for record in state_data:
                        record['discovery_method'] = f'state_{state}'
                    data.extend(state_data)

            time.sleep(0.02)  # Pace requests
        except Exception:
            # Best-effort per state: skip failures.
            continue

    return data
|
314 |
+
|
315 |
+
def discover_metropolitan_areas(self, api_key: str) -> List[Dict]:
    """Target major metropolitan statistical areas.

    Queries the lat/long observation endpoint with a 50-mile radius
    around each of 24 metro-center coordinates.

    Returns: list of raw observation dicts (possibly empty).
    """
    data = []

    # MSA center coordinates (lat, lon) for precise targeting.
    metro_areas = [
        (40.7128, -74.0060),  # NYC
        (34.0522, -118.2437),  # Los Angeles
        (41.8781, -87.6298),  # Chicago
        (29.7604, -95.3698),  # Houston
        (33.4484, -112.0740),  # Phoenix
        (39.9526, -75.1652),  # Philadelphia
        (29.4241, -98.4936),  # San Antonio
        (32.7767, -96.7970),  # Dallas
        (37.3382, -121.8863),  # San Jose
        (30.2672, -97.7431),  # Austin
        # Additional major metros
        (32.0835, -81.0998),  # Savannah
        (35.2271, -80.8431),  # Charlotte
        (36.1627, -86.7816),  # Nashville
        (39.7391, -104.9847),  # Denver
        (47.6062, -122.3321),  # Seattle
        (45.5152, -122.6784),  # Portland
        (39.2904, -76.6122),  # Baltimore
        (38.9072, -77.0369),  # Washington DC
        (42.3601, -71.0589),  # Boston
        (25.7617, -80.1918),  # Miami
        (26.1224, -80.1373),  # Fort Lauderdale
        (28.5383, -81.3792),  # Orlando
        (27.9506, -82.4572),  # Tampa
        (30.3322, -81.6557),  # Jacksonville
    ]

    for lat, lon in metro_areas:
        try:
            url = f"{self.base_url}/aq/observation/latLong/current/"
            params = {
                "format": "application/json",
                "latitude": lat,
                "longitude": lon,
                "distance": 50,  # 50-mile radius around metro center
                "API_KEY": api_key
            }

            response = requests.get(url, params=params, timeout=10)
            if response.status_code == 200:
                metro_data = response.json()
                if metro_data:
                    for record in metro_data:
                        record['discovery_method'] = f'metro_{lat}_{lon}'
                    data.extend(metro_data)

            time.sleep(0.02)  # Pace requests
        except Exception:
            # Was a bare ``except:`` — narrowed so Ctrl-C still interrupts.
            continue

    return data
|
372 |
|
373 |
+
def comprehensive_deduplication(self, data: List[Dict]) -> List[Dict]:
|
374 |
+
"""Comprehensive deduplication preserving maximum unique stations"""
|
375 |
seen_stations = set()
|
376 |
unique_data = []
|
377 |
|
378 |
for item in data:
|
379 |
+
# Create ultra-specific key to preserve different monitor types
|
380 |
station_key = (
|
381 |
+
round(item.get('Latitude', 0), 8), # Very high precision
|
382 |
+
round(item.get('Longitude', 0), 8),
|
383 |
item.get('ParameterName', ''),
|
384 |
+
item.get('SiteName', ''),
|
385 |
+
item.get('AgencyName', ''),
|
386 |
+
item.get('MonitorType', ''),
|
387 |
+
item.get('ReportingArea', ''),
|
388 |
+
item.get('StateCode', ''),
|
389 |
+
item.get('CountyCode', '')
|
390 |
)
|
391 |
|
392 |
if station_key not in seen_stations:
|