nakas committed on
Commit
3ebeb98
·
verified ·
1 Parent(s): be54199

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +306 -184
app.py CHANGED
@@ -42,229 +42,351 @@ class AirQualityMapper:
42
 
43
def fetch_airnow_data(self, api_key: str) -> Tuple[List[Dict], str]:
    """
    Fetch monitoring-site records from AirNow using bounding-box queries.

    One request is sent to the Monitoring Sites endpoint per bounding box.
    If that yields fewer than 100 records in total, up to 1000 ZIP-code
    observation queries are issued as a fallback. The combined records are
    passed through ``self.advanced_deduplication`` before being returned.

    Returns: (data_list, status_message)
    """
    if not api_key or api_key.strip() == "":
        return [], "❌ Please enter a valid AirNow API key"

    # Do not echo any part of the key: console output often ends up in shared logs.
    print("Using API key: [redacted]" if len(api_key) > 8 else "API key too short")

    try:
        # Failures that should skip a single request rather than abort the run:
        # transport errors and response bodies that are not valid JSON.
        recoverable = (requests.exceptions.RequestException, ValueError)

        all_data = []
        successful_requests = 0

        print("Using Monitoring Sites endpoint with systematic bounding box coverage...")

        # Each tuple is (minLat, maxLat, minLon, maxLon).
        box_keys = ("minLat", "maxLat", "minLon", "maxLon")
        box_corners = [
            # Continental US - coarse grid
            # West Coast
            (32.0, 42.0, -125.0, -115.0),  # CA, OR, WA coast
            (42.0, 49.0, -125.0, -115.0),  # WA, OR north
            (32.0, 42.0, -115.0, -105.0),  # Interior West
            (42.0, 49.0, -115.0, -105.0),  # Mountain North
            # Mountain States
            (32.0, 42.0, -105.0, -95.0),   # CO, NM, parts of TX
            (42.0, 49.0, -105.0, -95.0),   # MT, ND, SD north
            # Central US
            (25.0, 35.0, -105.0, -95.0),   # TX, southern states
            (35.0, 42.0, -95.0, -85.0),    # Central plains
            (42.0, 49.0, -95.0, -85.0),    # Upper Midwest
            # Eastern US
            (25.0, 35.0, -95.0, -85.0),    # Southern states
            (35.0, 42.0, -85.0, -75.0),    # Mid-Atlantic
            (42.0, 49.0, -85.0, -75.0),    # Great Lakes
            # East Coast
            (25.0, 35.0, -85.0, -75.0),    # FL, GA, SC, NC
            (35.0, 42.0, -75.0, -65.0),    # Mid-Atlantic coast
            (42.0, 49.0, -75.0, -65.0),    # New England
            # Alaska
            (55.0, 65.0, -170.0, -150.0),  # Western Alaska
            (65.0, 72.0, -170.0, -150.0),  # Northern Alaska
            (55.0, 65.0, -150.0, -130.0),  # Central Alaska
            (65.0, 72.0, -150.0, -130.0),  # North Central Alaska
            # Hawaii
            (18.0, 23.0, -162.0, -154.0),  # Hawaiian Islands
            # Smaller refinement boxes around major metropolitan areas
            (33.5, 34.5, -118.8, -117.8),  # Los Angeles
            (37.3, 38.0, -122.8, -122.0),  # San Francisco Bay
            (40.4, 41.0, -74.5, -73.5),    # NYC Metro
            (41.6, 42.2, -88.0, -87.0),    # Chicago
            (29.5, 30.2, -95.8, -95.0),    # Houston
            (32.5, 33.2, -97.5, -96.5),    # Dallas-Fort Worth
            (25.5, 26.2, -80.8, -80.0),    # Miami
            (33.6, 34.0, -84.8, -84.0),    # Atlanta
            (39.7, 40.2, -75.5, -74.8),    # Philadelphia
            (42.2, 42.6, -71.3, -70.8),    # Boston
            (47.4, 47.8, -122.5, -122.0),  # Seattle
            (38.7, 39.1, -77.3, -76.8),    # Washington DC
            (39.1, 39.4, -76.8, -76.3),    # Baltimore
            (42.2, 42.5, -83.3, -82.8),    # Detroit
            (44.7, 45.2, -93.5, -93.0),    # Minneapolis
            (29.9, 30.4, -90.3, -89.8),    # New Orleans
            (36.0, 36.4, -86.0, -85.5),    # Nashville
            (35.1, 35.4, -81.0, -80.5),    # Charlotte
            (39.0, 39.4, -84.8, -84.3),    # Cincinnati
            (41.3, 41.7, -81.9, -81.4),    # Cleveland
            (40.3, 40.7, -80.2, -79.7),    # Pittsburgh
        ]
        bounding_boxes = [dict(zip(box_keys, corners)) for corners in box_corners]

        site_url = f"{self.base_url}/aq/data/monitoringSite/"
        for i, bbox in enumerate(bounding_boxes):
            try:
                params = {"format": "application/json", "API_KEY": api_key, **bbox}

                print(f"Querying bounding box {i+1}/{len(bounding_boxes)}: {bbox}")
                response = requests.get(site_url, params=params, timeout=20)

                if response.status_code == 200:
                    data = response.json()
                    # Only a JSON array of objects can be tagged; anything else
                    # (e.g. an error object) is treated as "no data".
                    sites = [s for s in data if isinstance(s, dict)] if isinstance(data, list) else []
                    if sites:
                        print(f"Found {len(sites)} monitoring sites in box {i+1}")
                        for site in sites:
                            site['source_bbox'] = f"Box_{i+1}"
                        all_data.extend(sites)
                        successful_requests += 1
                    else:
                        print(f"No data in box {i+1}")
                else:
                    print(f"Error {response.status_code} for box {i+1}: {response.text[:100]}")

                # Short pause between requests to stay polite to the API.
                time.sleep(0.1)

            except recoverable as e:
                print(f"Request failed for box {i+1}: {str(e)}")
                continue

        print(f"Monitoring Sites endpoint: {len(all_data)} total records from {successful_requests} successful requests")

        # Fallback: too few records from the bounding boxes, so query by ZIP code.
        if len(all_data) < 100:
            print("Falling back to comprehensive ZIP code strategy...")

            priority_zips = self.generate_comprehensive_zip_list()[:1000]
            zip_url = f"{self.base_url}/aq/observation/zipCode/current/"

            for i, zipcode in enumerate(priority_zips):
                try:
                    params = {
                        "format": "application/json",
                        "zipCode": zipcode,
                        "distance": 150,
                        "API_KEY": api_key,
                    }

                    response = requests.get(zip_url, params=params, timeout=15)

                    if response.status_code == 200:
                        data = response.json()
                        observations = (
                            [o for o in data if isinstance(o, dict)]
                            if isinstance(data, list) else []
                        )
                        if observations:
                            for observation in observations:
                                observation['source_zipcode'] = zipcode
                            all_data.extend(observations)
                            successful_requests += 1

                    time.sleep(0.05)

                    if i % 100 == 0:
                        print(f"Processed {i+1}/{len(priority_zips)} ZIP codes, found {len(all_data)} stations")

                except recoverable:
                    # Best effort: skip this ZIP code, but let Ctrl-C / SystemExit through.
                    continue

        print(f"Total data collected: {len(all_data)} records")

        if not all_data:
            return [], "⚠️ No monitoring stations found. Please check your API key."

        unique_data = self.advanced_deduplication(all_data)

        print(f"After advanced deduplication: {len(unique_data)} unique monitoring stations")

        return unique_data, f"✅ Successfully loaded {len(unique_data)} monitoring stations from {successful_requests} API calls using systematic bounding box coverage"

    except Exception as e:
        # Top-level boundary: report the failure to the caller as a status message.
        print(f"General error: {str(e)}")
        return [], f"❌ Error fetching data: {str(e)}"
215
 
216
def generate_comprehensive_zip_list(self) -> List[str]:
    """Return the ZIP codes to query: a hand-picked list plus 500 generated codes.

    The hand-picked entries come first, in the order written below (one code
    appears twice). They are followed by the first 500 strings of the form
    ``SSAAA``, where ``SS`` runs 01..99 and ``AAA`` runs 001, 051, ..., 951.
    The generated strings are not checked against real ZIP code assignments.
    """
    hand_picked = [
        # Large metropolitan areas
        "90210", "90024", "90210", "91101", "91201", "90001", "90002", "90003",
        "10001", "10002", "10003", "10019", "10021", "10022", "10023", "10024",
        "60601", "60602", "60603", "60604", "60605", "60606", "60607", "60608",
        "75201", "75202", "75203", "75204", "75205", "75206", "75207", "75208",
        "33101", "33102", "33109", "33124", "33125", "33126", "33127", "33128",
        "77001", "77002", "77003", "77004", "77005", "77006", "77007", "77008",
        "30301", "30302", "30303", "30309", "30318", "30324", "30326", "30327",
        "19101", "19102", "19103", "19104", "19106", "19107", "19123", "19146",
        "85001", "85003", "85004", "85006", "85007", "85008", "85009", "85013",
        "28201", "28202", "28203", "28204", "28205", "28206", "28207", "28208",

        # Less densely populated states
        "99501", "99502", "99503", "99504", "99507", "99508", "99515", "99577",  # Alaska
        "96801", "96802", "96813", "96814", "96815", "96816", "96817", "96818",  # Hawaii
        "83701", "83702", "83703", "83704", "83705", "83706", "83709", "83712",  # Idaho
        "59601", "59602", "59718", "59724", "59801", "59802", "59803", "59808",  # Montana
        "82001", "82009", "82601", "82602", "82604", "82605", "82609", "82633",  # Wyoming
        "58501", "58502", "58503", "58504", "58701", "58702", "58703", "58704",  # North Dakota
        "57501", "57701", "57702", "57703", "57104", "57105", "57106", "57197",  # South Dakota
    ]

    # Two-digit prefix followed by every 50th three-digit suffix.
    candidates = (
        f"{prefix:02d}{suffix:03d}"
        for prefix in range(1, 100)
        for suffix in range(1, 1000, 50)
    )
    generated = [code for code in candidates if len(code) == 5]

    return hand_picked + generated[:500]
253
 
254
- def advanced_deduplication(self, data: List[Dict]) -> List[Dict]:
255
- """Advanced deduplication preserving maximum unique stations per research"""
256
  seen_stations = set()
257
  unique_data = []
258
 
259
  for item in data:
260
- # Create highly specific key to avoid over-deduplication
261
  station_key = (
262
- round(item.get('Latitude', 0), 6), # Very precise location
263
- round(item.get('Longitude', 0), 6),
264
  item.get('ParameterName', ''),
265
- item.get('AgencyName', ''), # Different agencies may have co-located monitors
266
- item.get('SiteName', ''), # Site-specific identification
267
- item.get('MonitorType', '') # Different monitor types
 
 
 
268
  )
269
 
270
  if station_key not in seen_stations:
 
42
 
43
def fetch_airnow_data(self, api_key: str) -> Tuple[List[Dict], str]:
    """
    Collect monitoring-station records by running every discovery strategy.

    The five ``discover_*`` methods are called in order with the API key,
    their results are concatenated, and the combined list is passed through
    ``self.comprehensive_deduplication``.

    Returns: (data_list, status_message)
    """
    if not api_key or api_key.strip() == "":
        return [], "❌ Please enter a valid AirNow API key"

    # Send exactly what was validated: stray whitespace from a copy/paste
    # would otherwise travel with every request.
    api_key = api_key.strip()

    # Do not echo any part of the key: console output often ends up in shared logs.
    print("Starting systematic discovery (API key redacted)...")

    # (banner printed before the call, method name, summary printed after the call)
    strategies = [
        ("🔍 STRATEGY 1: Systematic county-by-county enumeration...",
         "discover_by_counties",
         "Counties strategy found: {} stations"),
        ("🔍 STRATEGY 2: High-resolution coordinate grid sweep...",
         "discover_by_coordinate_grid",
         "Grid sweep found: {} additional records"),
        ("🔍 STRATEGY 3: Bulk file endpoint discovery...",
         "discover_bulk_endpoints",
         "Bulk endpoints found: {} additional records"),
        ("🔍 STRATEGY 4: State-by-state with all parameter types...",
         "discover_by_states_and_parameters",
         "State/parameter strategy found: {} additional records"),
        ("🔍 STRATEGY 5: Metropolitan statistical area enumeration...",
         "discover_metropolitan_areas",
         "Metro areas found: {} additional records"),
    ]

    try:
        all_data = []

        for banner, method_name, summary in strategies:
            print(banner)
            found = getattr(self, method_name)(api_key)
            all_data.extend(found)
            print(summary.format(len(found)))

        print(f"Total raw data collected: {len(all_data)} records")

        if not all_data:
            return [], "⚠️ No monitoring stations discovered. API key or network issue."

        unique_data = self.comprehensive_deduplication(all_data)

        print(f"After comprehensive deduplication: {len(unique_data)} unique monitoring stations")

        return unique_data, f"🎯 SYSTEMATIC DISCOVERY: Found {len(unique_data)} unique monitoring stations from comprehensive enumeration"

    except Exception as e:
        # Top-level boundary: report the failure to the caller as a status message.
        print(f"Discovery error: {str(e)}")
        return [], f"❌ Error in systematic discovery: {str(e)}"
102
+
103
def discover_by_counties(self, api_key: str) -> List[Dict]:
    """Query county-level endpoints for a fixed list of county FIPS codes.

    For each 5-digit FIPS code (2-digit state + 3-digit county) the candidate
    endpoints are tried in order; the first one that returns a non-empty JSON
    array wins and the remaining endpoints are skipped for that county.
    Every kept record is tagged with ``discovery_method = 'county_<fips>'``.

    Args:
        api_key: AirNow API key sent as the ``API_KEY`` query parameter.

    Returns:
        All tagged records, in discovery order. Failed requests are skipped.
    """
    data = []

    # Failures that should skip one request rather than abort the sweep:
    # transport errors and response bodies that are not valid JSON.
    recoverable = (requests.exceptions.RequestException, ValueError)

    # Ten counties for each of twenty states (200 codes in total).
    all_fips = [
        # California counties (major population centers)
        "06037", "06073", "06059", "06065", "06075", "06001", "06013", "06017", "06029", "06071",
        # Texas counties
        "48201", "48113", "48439", "48453", "48157", "48029", "48215", "48085", "48167", "48141",
        # Florida counties
        "12086", "12011", "12057", "12103", "12095", "12099", "12031", "12105", "12117", "12127",
        # New York counties
        "36061", "36047", "36081", "36059", "36103", "36119", "36001", "36029", "36063", "36067",
        # Illinois counties
        "17031", "17043", "17089", "17097", "17111", "17163", "17019", "17037", "17093", "17161",
        # Pennsylvania counties
        "42101", "42003", "42017", "42129", "42045", "42049", "42091", "42095", "42133", "42125",
        # Ohio counties
        "39035", "39061", "39093", "39113", "39151", "39153", "39165", "39017", "39025", "39041",
        # Georgia counties
        "13121", "13135", "13089", "13097", "13113", "13117", "13151", "13067", "13073", "13077",
        # North Carolina counties
        "37119", "37063", "37081", "37129", "37179", "37183", "37035", "37071", "37025", "37159",
        # Michigan counties
        "26163", "26161", "26099", "26125", "26049", "26065", "26075", "26115", "26139", "26145",
        # New Jersey counties
        "34003", "34013", "34017", "34023", "34025", "34027", "34031", "34035", "34037", "34039",
        # Virginia counties
        "51059", "51087", "51095", "51107", "51153", "51177", "51179", "51013", "51041", "51161",
        # Washington counties
        "53033", "53053", "53061", "53063", "53067", "53011", "53015", "53021", "53035", "53057",
        # Arizona counties
        "04013", "04019", "04021", "04003", "04005", "04007", "04009", "04011", "04015", "04017",
        # Massachusetts counties
        "25017", "25021", "25023", "25025", "25027", "25005", "25009", "25013", "25015", "25003",
        # Tennessee counties
        "47037", "47157", "47093", "47065", "47149", "47165", "47113", "47125", "47147", "47179",
        # Indiana counties
        "18097", "18003", "18089", "18141", "18163", "18167", "18057", "18063", "18095", "18105",
        # Missouri counties
        "29095", "29189", "29099", "29183", "29037", "29047", "29071", "29077", "29113", "29219",
        # Wisconsin counties
        "55079", "55025", "55133", "55105", "55009", "55017", "55021", "55035", "55087", "55127",
        # Maryland counties
        "24003", "24005", "24013", "24025", "24027", "24031", "24033", "24035", "24043", "24510",
    ]
    total = len(all_fips)

    # (url, whether the request carries a countyCode). Built once, not per county.
    # NOTE(review): these paths are tried speculatively; confirm against the AirNow API docs.
    endpoints = [
        (f"{self.base_url}/aq/observation/county/current/", True),
        (f"{self.base_url}/aq/data/county/", True),
        (f"{self.base_url}/aq/observation/state/current/", False),
    ]

    for i, fips in enumerate(all_fips):
        # Separate names for the two halves: reusing the list's name here would
        # shadow it and make the progress total below report len() of a string.
        state_code, county_code = fips[:2], fips[2:]

        for endpoint, needs_county in endpoints:
            params = {"format": "application/json", "stateCode": state_code}
            if needs_county:
                params["countyCode"] = county_code
            params["API_KEY"] = api_key

            try:
                response = requests.get(endpoint, params=params, timeout=10)
                payload = response.json() if response.status_code == 200 else None
            except recoverable:
                # Best effort: try the next endpoint, but let Ctrl-C / SystemExit through.
                continue

            # Only a JSON array of objects can be tagged; anything else counts as "no data".
            records = [r for r in payload if isinstance(r, dict)] if isinstance(payload, list) else []
            if records:
                for record in records:
                    record['discovery_method'] = f'county_{fips}'
                data.extend(records)
                print(f"County {fips}: {len(records)} stations")
                break  # Found data, move to next county

            time.sleep(0.02)  # brief pause before trying the next endpoint

        if i % 50 == 0:
            print(f"Processed {i+1}/{total} counties, found {len(data)} total records")

    return data
200
+
201
def discover_by_coordinate_grid(self, api_key: str) -> List[Dict]:
    """Sweep three latitude/longitude rectangles with a 2-degree grid.

    One lat/long observation request is sent per grid cell, centered on the
    cell, with a distance of 100. Every returned record is tagged with
    ``discovery_method = 'grid_<lat>_<lon>'``, where ``<lat>``/``<lon>`` are
    the cell's lower-left integer corner.

    Args:
        api_key: AirNow API key sent as the ``API_KEY`` query parameter.

    Returns:
        All tagged records, in sweep order. Failed requests are skipped.
    """
    data = []

    # Failures that should skip one cell rather than abort the sweep:
    # transport errors and response bodies that are not valid JSON.
    recoverable = (requests.exceptions.RequestException, ValueError)

    # Cell size in whole degrees, for both axes.
    step = 2
    half_cell = step / 2

    # ((min_lat, max_lat), (min_lon, max_lon)) per region.
    regions = [
        ((25, 50), (-125, -65)),    # Continental
        ((50, 72), (-170, -130)),   # Alaska
        ((18, 25), (-162, -154)),   # Hawaii
    ]

    url = f"{self.base_url}/aq/observation/latLong/current/"

    for region_idx, ((min_lat, max_lat), (min_lon, max_lon)) in enumerate(regions):
        for lat in range(min_lat, max_lat, step):
            for lon in range(min_lon, max_lon, step):
                params = {
                    "format": "application/json",
                    # True center of the cell: half a step from its lower-left
                    # corner (a fixed +0.5 is off-center for 2-degree cells).
                    "latitude": lat + half_cell,
                    "longitude": lon + half_cell,
                    "distance": 100,
                    "API_KEY": api_key,
                }

                try:
                    response = requests.get(url, params=params, timeout=8)
                    payload = response.json() if response.status_code == 200 else None
                except recoverable:
                    # Best effort: skip this cell, but let Ctrl-C / SystemExit through.
                    continue

                # Only a JSON array of objects can be tagged; anything else counts as "no data".
                if isinstance(payload, list):
                    records = [r for r in payload if isinstance(r, dict)]
                    for record in records:
                        record['discovery_method'] = f'grid_{lat}_{lon}'
                    data.extend(records)

                time.sleep(0.01)  # brief pause between requests

        print(f"Region {region_idx + 1} grid complete: {len(data)} total records")

    return data
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
def discover_bulk_endpoints(self, api_key: str) -> List[Dict]:
    """Probe a fixed set of candidate bulk-data URLs under ``self.base_url``.

    Each URL is requested once. A response is used only when it has status
    200 and its body is a non-empty JSON array; each object in it is tagged
    with ``discovery_method = 'bulk_endpoint'``.

    Args:
        api_key: AirNow API key sent as the ``API_KEY`` query parameter.

    Returns:
        All tagged records. URLs that fail or return anything else contribute nothing.
    """
    data = []

    # Failures that should skip one URL rather than abort the probe:
    # transport errors and response bodies that are not valid JSON.
    recoverable = (requests.exceptions.RequestException, ValueError)

    # NOTE(review): these paths are tried speculatively; confirm which, if any,
    # exist against the AirNow documentation.
    candidate_paths = (
        "/aq/data/",
        "/aq/data/sites/",
        "/aq/data/monitors/",
        "/aq/data/stations/",
        "/files/data/",
        "/files/reportingarea.dat",
        "/files/sites/",
    )

    for path in candidate_paths:
        endpoint = f"{self.base_url}{path}"
        try:
            response = requests.get(
                endpoint,
                params={"format": "application/json", "API_KEY": api_key},
                timeout=15,
            )
            if response.status_code != 200:
                continue
            payload = response.json()
        except recoverable:
            # Best effort: move on to the next URL, but let Ctrl-C / SystemExit through.
            continue

        if not isinstance(payload, list):
            continue

        # Keep only JSON objects; a stray scalar must not discard the whole payload.
        records = [r for r in payload if isinstance(r, dict)]
        if records:
            for record in records:
                record['discovery_method'] = 'bulk_endpoint'
            data.extend(records)
            print(f"Bulk endpoint {endpoint}: {len(records)} records")

    return data
276
+
277
def discover_by_states_and_parameters(self, api_key: str) -> List[Dict]:
    """Query the state observation endpoint once per state code.

    The request carries only the state code, so it does not depend on the
    parameter name. It is therefore issued once per state, not once per
    (state, parameter) pair. To keep the returned list in the same shape as
    before, each record is emitted once per parameter name, tagged with
    ``discovery_method = 'state_<state>_<param>'``.

    Args:
        api_key: AirNow API key sent as the ``API_KEY`` query parameter.

    Returns:
        Tagged records grouped by state, then by parameter name. States whose
        request fails contribute nothing.
    """
    data = []

    # Failures that should skip one state rather than abort the sweep:
    # transport errors and response bodies that are not valid JSON.
    recoverable = (requests.exceptions.RequestException, ValueError)

    # Two-digit state codes, including 11, 72 and 78.
    states = [
        "01", "02", "04", "05", "06", "08", "09", "10", "11", "12", "13", "15", "16", "17", "18", "19",
        "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
        "36", "37", "38", "39", "40", "41", "42", "44", "45", "46", "47", "48", "49", "50", "51", "53",
        "54", "55", "56", "72", "78",
    ]

    # NOTE(review): these names are used for tagging only; they are not sent to
    # the API. If the endpoint supports a parameter filter, pass it in `params`
    # and move the request back inside the per-parameter loop.
    parameters = ["OZONE", "PM25", "PM10", "CO", "NO2", "SO2"]

    url = f"{self.base_url}/aq/observation/state/current/"

    for state in states:
        params = {
            "format": "application/json",
            "stateCode": state,
            "API_KEY": api_key,
        }

        try:
            response = requests.get(url, params=params, timeout=10)
            payload = response.json() if response.status_code == 200 else None
        except recoverable:
            # Best effort: skip this state, but let Ctrl-C / SystemExit through.
            continue

        # Only a JSON array of objects can be tagged; anything else counts as "no data".
        records = [r for r in payload if isinstance(r, dict)] if isinstance(payload, list) else []

        for param in parameters:
            # Independent copies so each tag lives on its own dict.
            data.extend(
                {**record, 'discovery_method': f'state_{state}_{param}'}
                for record in records
            )

        time.sleep(0.02)  # brief pause between requests

    return data
314
+
315
def discover_metropolitan_areas(self, api_key: str) -> List[Dict]:
    """Query the lat/long observation endpoint around a fixed list of city centers.

    One request is sent per (latitude, longitude) pair with a distance of 50.
    Every returned record is tagged with
    ``discovery_method = 'metro_<lat>_<lon>'``.

    Args:
        api_key: AirNow API key sent as the ``API_KEY`` query parameter.

    Returns:
        All tagged records, in list order. Failed requests are skipped.
    """
    data = []

    # Failures that should skip one city rather than abort the sweep:
    # transport errors and response bodies that are not valid JSON.
    recoverable = (requests.exceptions.RequestException, ValueError)

    # (latitude, longitude) of each metro center.
    metro_areas = [
        (40.7128, -74.0060),   # NYC
        (34.0522, -118.2437),  # Los Angeles
        (41.8781, -87.6298),   # Chicago
        (29.7604, -95.3698),   # Houston
        (33.4484, -112.0740),  # Phoenix
        (39.9526, -75.1652),   # Philadelphia
        (29.4241, -98.4936),   # San Antonio
        (32.7767, -96.7970),   # Dallas
        (37.3382, -121.8863),  # San Jose
        (30.2672, -97.7431),   # Austin
        (32.0835, -81.0998),   # Savannah
        (35.2271, -80.8431),   # Charlotte
        (36.1627, -86.7816),   # Nashville
        (39.7391, -104.9847),  # Denver
        (47.6062, -122.3321),  # Seattle
        (45.5152, -122.6784),  # Portland
        (39.2904, -76.6122),   # Baltimore
        (38.9072, -77.0369),   # Washington DC
        (42.3601, -71.0589),   # Boston
        (25.7617, -80.1918),   # Miami
        (26.1224, -80.1373),   # Fort Lauderdale
        (28.5383, -81.3792),   # Orlando
        (27.9506, -82.4572),   # Tampa
        (30.3322, -81.6557),   # Jacksonville
    ]

    url = f"{self.base_url}/aq/observation/latLong/current/"

    for lat, lon in metro_areas:
        params = {
            "format": "application/json",
            "latitude": lat,
            "longitude": lon,
            "distance": 50,
            "API_KEY": api_key,
        }

        try:
            response = requests.get(url, params=params, timeout=10)
            payload = response.json() if response.status_code == 200 else None
        except recoverable:
            # Best effort: skip this city, but let Ctrl-C / SystemExit through.
            continue

        # Only a JSON array of objects can be tagged; anything else counts as "no data".
        if isinstance(payload, list):
            records = [r for r in payload if isinstance(r, dict)]
            for record in records:
                record['discovery_method'] = f'metro_{lat}_{lon}'
            data.extend(records)

        time.sleep(0.02)  # brief pause between requests

    return data
372
 
373
+ def comprehensive_deduplication(self, data: List[Dict]) -> List[Dict]:
374
+ """Comprehensive deduplication preserving maximum unique stations"""
375
  seen_stations = set()
376
  unique_data = []
377
 
378
  for item in data:
379
+ # Create ultra-specific key to preserve different monitor types
380
  station_key = (
381
+ round(item.get('Latitude', 0), 8), # Very high precision
382
+ round(item.get('Longitude', 0), 8),
383
  item.get('ParameterName', ''),
384
+ item.get('SiteName', ''),
385
+ item.get('AgencyName', ''),
386
+ item.get('MonitorType', ''),
387
+ item.get('ReportingArea', ''),
388
+ item.get('StateCode', ''),
389
+ item.get('CountyCode', '')
390
  )
391
 
392
  if station_key not in seen_stations: