Spaces:

nakas
/

Air-quality-Monitoring-sensor

Running

App Files Files Community

nakas committed about 23 hours ago

Commit

3ebeb98

verified ·

1 Parent(s): be54199

Update app.py

Browse files

Files changed (1) hide show

app.py +306 -184

app.py CHANGED Viewed

@@ -42,229 +42,351 @@ class AirQualityMapper:
     def fetch_airnow_data(self, api_key: str) -> Tuple[List[Dict], str]:
         """
-        Fetch ALL air quality monitoring stations using the Monitoring Sites endpoint
-        with systematic bounding box coverage as identified in research
         Returns: (data_list, status_message)
         """
         if not api_key or api_key.strip() == "":
             return [], "❌ Please enter a valid AirNow API key"
-        print(f"Using API key: {api_key[:8]}..." if len(api_key) > 8 else "API key too short")
         try:
             all_data = []
             successful_requests = 0
-            # STRATEGY FROM RESEARCH: Use Monitoring Sites endpoint with bounding box queries
-            # This bypasses the reporting area aggregation limitation
-            print("Using Monitoring Sites endpoint with systematic bounding box coverage...")
-            # Create systematic bounding box grid covering entire continental US + Alaska + Hawaii
-            # Based on research: H3 hexagonal grid with adaptive spacing
-            bounding_boxes = [
-                # Continental US - systematic grid coverage
-                # West Coast
-                {"minLat": 32.0, "maxLat": 42.0, "minLon": -125.0, "maxLon": -115.0},  # CA, OR, WA coast
-                {"minLat": 42.0, "maxLat": 49.0, "minLon": -125.0, "maxLon": -115.0},  # WA, OR north
-                {"minLat": 32.0, "maxLat": 42.0, "minLon": -115.0, "maxLon": -105.0},  # Interior West
-                {"minLat": 42.0, "maxLat": 49.0, "minLon": -115.0, "maxLon": -105.0},  # Mountain North
-                # Mountain States
-                {"minLat": 32.0, "maxLat": 42.0, "minLon": -105.0, "maxLon": -95.0},   # CO, NM, parts of TX
-                {"minLat": 42.0, "maxLat": 49.0, "minLon": -105.0, "maxLon": -95.0},   # MT, ND, SD north
-                # Central US
-                {"minLat": 25.0, "maxLat": 35.0, "minLon": -105.0, "maxLon": -95.0},   # TX, southern states
-                {"minLat": 35.0, "maxLat": 42.0, "minLon": -95.0, "maxLon": -85.0},    # Central plains
-                {"minLat": 42.0, "maxLat": 49.0, "minLon": -95.0, "maxLon": -85.0},    # Upper Midwest
-                # Eastern US
-                {"minLat": 25.0, "maxLat": 35.0, "minLon": -95.0, "maxLon": -85.0},    # Southern states
-                {"minLat": 35.0, "maxLat": 42.0, "minLon": -85.0, "maxLon": -75.0},    # Mid-Atlantic
-                {"minLat": 42.0, "maxLat": 49.0, "minLon": -85.0, "maxLon": -75.0},    # Great Lakes
-                # East Coast
-                {"minLat": 25.0, "maxLat": 35.0, "minLon": -85.0, "maxLon": -75.0},    # FL, GA, SC, NC
-                {"minLat": 35.0, "maxLat": 42.0, "minLon": -75.0, "maxLon": -65.0},    # Mid-Atlantic coast
-                {"minLat": 42.0, "maxLat": 49.0, "minLon": -75.0, "maxLon": -65.0},    # New England
-                # Alaska - systematic coverage
-                {"minLat": 55.0, "maxLat": 65.0, "minLon": -170.0, "maxLon": -150.0},  # Western Alaska
-                {"minLat": 65.0, "maxLat": 72.0, "minLon": -170.0, "maxLon": -150.0},  # Northern Alaska
-                {"minLat": 55.0, "maxLat": 65.0, "minLon": -150.0, "maxLon": -130.0},  # Central Alaska
-                {"minLat": 65.0, "maxLat": 72.0, "minLon": -150.0, "maxLon": -130.0},  # North Central Alaska
-                # Hawaii
-                {"minLat": 18.0, "maxLat": 23.0, "minLon": -162.0, "maxLon": -154.0},  # Hawaiian Islands
-                # High-density urban refinement boxes (smaller areas for dense coverage)
-                # Major metropolitan areas - research shows these have multiple stations
-                {"minLat": 33.5, "maxLat": 34.5, "minLon": -118.8, "maxLon": -117.8}, # Los Angeles
-                {"minLat": 37.3, "maxLat": 38.0, "minLon": -122.8, "maxLon": -122.0}, # San Francisco Bay
-                {"minLat": 40.4, "maxLat": 41.0, "minLon": -74.5, "maxLon": -73.5},   # NYC Metro
-                {"minLat": 41.6, "maxLat": 42.2, "minLon": -88.0, "maxLon": -87.0},   # Chicago
-                {"minLat": 29.5, "maxLat": 30.2, "minLon": -95.8, "maxLon": -95.0},   # Houston
-                {"minLat": 32.5, "maxLat": 33.2, "minLon": -97.5, "maxLon": -96.5},   # Dallas-Fort Worth
-                {"minLat": 25.5, "maxLat": 26.2, "minLon": -80.8, "maxLon": -80.0},   # Miami
-                {"minLat": 33.6, "maxLat": 34.0, "minLon": -84.8, "maxLon": -84.0},   # Atlanta
-                {"minLat": 39.7, "maxLat": 40.2, "minLon": -75.5, "maxLon": -74.8},   # Philadelphia
-                {"minLat": 42.2, "maxLat": 42.6, "minLon": -71.3, "maxLon": -70.8},   # Boston
-                {"minLat": 47.4, "maxLat": 47.8, "minLon": -122.5, "maxLon": -122.0}, # Seattle
-                {"minLat": 38.7, "maxLat": 39.1, "minLon": -77.3, "maxLon": -76.8},   # Washington DC
-                {"minLat": 39.1, "maxLat": 39.4, "minLon": -76.8, "maxLon": -76.3},   # Baltimore
-                {"minLat": 42.2, "maxLat": 42.5, "minLon": -83.3, "maxLon": -82.8},   # Detroit
-                {"minLat": 44.7, "maxLat": 45.2, "minLon": -93.5, "maxLon": -93.0},   # Minneapolis
-                {"minLat": 29.9, "maxLat": 30.4, "minLon": -90.3, "maxLon": -89.8},   # New Orleans
-                {"minLat": 36.0, "maxLat": 36.4, "minLon": -86.0, "maxLon": -85.5},   # Nashville
-                {"minLat": 35.1, "maxLat": 35.4, "minLon": -81.0, "maxLon": -80.5},   # Charlotte
-                {"minLat": 39.0, "maxLat": 39.4, "minLon": -84.8, "maxLon": -84.3},   # Cincinnati
-                {"minLat": 41.3, "maxLat": 41.7, "minLon": -81.9, "maxLon": -81.4},   # Cleveland
-                {"minLat": 40.3, "maxLat": 40.7, "minLon": -80.2, "maxLon": -79.7},   # Pittsburgh
-            ]
-            # Use the Monitoring Sites endpoint as identified in research
-            for i, bbox in enumerate(bounding_boxes):
-                try:
-                    # Research finding: Use monitoring sites endpoint with bounding box
-                    url = f"{self.base_url}/aq/data/monitoringSite/"
-                    params = {
-                        "format": "application/json",
-                        "API_KEY": api_key,
-                        "minLat": bbox["minLat"],
-                        "maxLat": bbox["maxLat"],
-                        "minLon": bbox["minLon"],
-                        "maxLon": bbox["maxLon"]
-                    }
-                    print(f"Querying bounding box {i+1}/{len(bounding_boxes)}: {bbox}")
-                    response = requests.get(url, params=params, timeout=20)
-                    if response.status_code == 200:
-                        data = response.json()
-                        if data:
-                            print(f"Found {len(data)} monitoring sites in box {i+1}")
-                            for site in data:
-                                site['source_bbox'] = f"Box_{i+1}"
-                            all_data.extend(data)
-                            successful_requests += 1
-                        else:
-                            print(f"No data in box {i+1}")
-                    else:
-                        print(f"Error {response.status_code} for box {i+1}: {response.text[:100]}")
-                    # Research shows 500 requests per hour limit - pace accordingly
-                    time.sleep(0.1)  # Fast processing within rate limits
-                except requests.exceptions.RequestException as e:
-                    print(f"Request failed for box {i+1}: {str(e)}")
-                    continue
-            print(f"Monitoring Sites endpoint: {len(all_data)} total records from {successful_requests} successful requests")
-            # If monitoring sites endpoint didn't work, fall back to current observations with ALL zip codes
-            if len(all_data) < 100:
-                print("Falling back to comprehensive ZIP code strategy...")
-                # Research insight: Cover ALL major population centers systematically
-                # Generate comprehensive ZIP code list covering entire US population
-                zip_codes = self.generate_comprehensive_zip_list()
-                for i, zipcode in enumerate(zip_codes[:1000]):  # First 1000 most important
                     try:
-                        url = f"{self.base_url}/aq/observation/zipCode/current/"
                         params = {
                             "format": "application/json",
-                            "zipCode": zipcode,
-                            "distance": 150,  # Maximum radius for coverage
                             "API_KEY": api_key
                         }
-                        response = requests.get(url, params=params, timeout=15)
                         if response.status_code == 200:
-                            data = response.json()
-                            if data:
-                                for observation in data:
-                                    observation['source_zipcode'] = zipcode
-                                all_data.extend(data)
-                                successful_requests += 1
-                        time.sleep(0.05)  # Very fast processing
-                        if i % 100 == 0:
-                            print(f"Processed {i+1}/{len(zip_codes[:1000])} ZIP codes, found {len(all_data)} stations")
                     except:
                         continue
-            print(f"Total data collected: {len(all_data)} records")
-            if not all_data:
-                return [], f"⚠️ No monitoring stations found. Please check your API key."
-            # Advanced deduplication from research - preserve maximum unique stations
-            unique_data = self.advanced_deduplication(all_data)
-            print(f"After advanced deduplication: {len(unique_data)} unique monitoring stations")
-            return unique_data, f"✅ Successfully loaded {len(unique_data)} monitoring stations from {successful_requests} API calls using systematic bounding box coverage"
-        except Exception as e:
-            print(f"General error: {str(e)}")
-            return [], f"❌ Error fetching data: {str(e)}"
-    def generate_comprehensive_zip_list(self) -> List[str]:
-        """Generate comprehensive ZIP code list covering all US population centers"""
-        # Major metropolitan statistical areas + comprehensive coverage
-        zip_codes = [
-            # Top 100 metropolitan areas by population
-            "90210", "90024", "90210", "91101", "91201", "90001", "90002", "90003",
-            "10001", "10002", "10003", "10019", "10021", "10022", "10023", "10024",
-            "60601", "60602", "60603", "60604", "60605", "60606", "60607", "60608",
-            "75201", "75202", "75203", "75204", "75205", "75206", "75207", "75208",
-            "33101", "33102", "33109", "33124", "33125", "33126", "33127", "33128",
-            "77001", "77002", "77003", "77004", "77005", "77006", "77007", "77008",
-            "30301", "30302", "30303", "30309", "30318", "30324", "30326", "30327",
-            "19101", "19102", "19103", "19104", "19106", "19107", "19123", "19146",
-            "85001", "85003", "85004", "85006", "85007", "85008", "85009", "85013",
-            "28201", "28202", "28203", "28204", "28205", "28206", "28207", "28208",
-            # Continue with state capitals and major cities from all 50 states
-            "99501", "99502", "99503", "99504", "99507", "99508", "99515", "99577",  # Alaska
-            "96801", "96802", "96813", "96814", "96815", "96816", "96817", "96818",  # Hawaii
-            "83701", "83702", "83703", "83704", "83705", "83706", "83709", "83712",  # Idaho
-            "59601", "59602", "59718", "59724", "59801", "59802", "59803", "59808",  # Montana
-            "82001", "82009", "82601", "82602", "82604", "82605", "82609", "82633",  # Wyoming
-            "58501", "58502", "58503", "58504", "58701", "58702", "58703", "58704",  # North Dakota
-            "57501", "57701", "57702", "57703", "57104", "57105", "57106", "57197",  # South Dakota
-            # Add systematic coverage for remaining areas
         ]
-        # Add systematic grid of additional ZIP codes for complete coverage
-        additional_zips = []
-        for state_code in range(1, 100):
-            for area_code in range(1, 1000, 50):  # Every 50th area code for systematic coverage
-                zip_code = f"{state_code:02d}{area_code:03d}"
-                if len(zip_code) == 5:
-                    additional_zips.append(zip_code)
-        return zip_codes + additional_zips[:500]  # Top priority zips + systematic coverage
-    def advanced_deduplication(self, data: List[Dict]) -> List[Dict]:
-        """Advanced deduplication preserving maximum unique stations per research"""
         seen_stations = set()
         unique_data = []
         for item in data:
-            # Create highly specific key to avoid over-deduplication
             station_key = (
-                round(item.get('Latitude', 0), 6),  # Very precise location
-                round(item.get('Longitude', 0), 6),
                 item.get('ParameterName', ''),
-                item.get('AgencyName', ''),        # Different agencies may have co-located monitors
-                item.get('SiteName', ''),          # Site-specific identification
-                item.get('MonitorType', '')        # Different monitor types
             )
             if station_key not in seen_stations:

     def fetch_airnow_data(self, api_key: str) -> Tuple[List[Dict], str]:
         """
+        Run every discovery strategy in turn and return the de-duplicated records.
+
+        The strategies are, in order: county enumeration, coordinate grid sweep,
+        bulk endpoint probing, state enumeration and metropolitan-area queries.
+        Their results are concatenated and passed to comprehensive_deduplication.
+
+        Args:
+            api_key: AirNow API key. An empty or whitespace-only key is rejected
+                before any discovery work starts.
+
         Returns: (data_list, status_message)
+            data_list is empty when the key is rejected, when no strategy
+            returned anything, or when an exception escaped a strategy;
+            status_message says which of these happened.
         """
         if not api_key or api_key.strip() == "":
             return [], "❌ Please enter a valid AirNow API key"
+        # Never log the whole credential: show a short prefix only, and nothing
+        # at all for keys short enough that a prefix would be most of the key.
+        key_hint = f"{api_key[:4]}..." if len(api_key) > 8 else "<hidden>"
+        print(f"Starting systematic discovery with API key: {key_hint}")
         try:
             all_data = []
             successful_requests = 0
+            # STRATEGY 1: County-by-county coverage over a fixed sample of county FIPS codes
+            print("🔍 STRATEGY 1: Systematic county-by-county enumeration...")
+            counties_found = self.discover_by_counties(api_key)
+            all_data.extend(counties_found)
+            print(f"Counties strategy found: {len(counties_found)} stations")
+            # STRATEGY 2: Coordinate grid sweep (cell size is chosen inside discover_by_coordinate_grid)
+            print("🔍 STRATEGY 2: High-resolution coordinate grid sweep...")
+            grid_found = self.discover_by_coordinate_grid(api_key)
+            all_data.extend(grid_found)
+            print(f"Grid sweep found: {len(grid_found)} additional records")
+            # STRATEGY 3: Try to access bulk file endpoints
+            print("🔍 STRATEGY 3: Bulk file endpoint discovery...")
+            bulk_found = self.discover_bulk_endpoints(api_key)
+            all_data.extend(bulk_found)
+            print(f"Bulk endpoints found: {len(bulk_found)} additional records")
+            # STRATEGY 4: Systematic state + parameter combination
+            print("🔍 STRATEGY 4: State-by-state with all parameter types...")
+            state_found = self.discover_by_states_and_parameters(api_key)
+            all_data.extend(state_found)
+            print(f"State/parameter strategy found: {len(state_found)} additional records")
+            # STRATEGY 5: Metropolitan area deep dive
+            print("🔍 STRATEGY 5: Metropolitan statistical area enumeration...")
+            metro_found = self.discover_metropolitan_areas(api_key)
+            all_data.extend(metro_found)
+            print(f"Metro areas found: {len(metro_found)} additional records")
+            print(f"Total raw data collected: {len(all_data)} records")
+            if not all_data:
+                return [], f"⚠️ No monitoring stations discovered. API key or network issue."
+            # The strategies overlap, so collapse records that describe the same monitor
+            unique_data = self.comprehensive_deduplication(all_data)
+            print(f"After comprehensive deduplication: {len(unique_data)} unique monitoring stations")
+            return unique_data, f"🎯 SYSTEMATIC DISCOVERY: Found {len(unique_data)} unique monitoring stations from comprehensive enumeration"
+        except Exception as e:
+            # Top-level boundary: report the failure through the status message
+            print(f"Discovery error: {str(e)}")
+            return [], f"❌ Error in systematic discovery: {str(e)}"
+    def discover_by_counties(self, api_key: str) -> List[Dict]:
+        """Query AirNow for a fixed sample of counties, identified by FIPS code.
+
+        For each county the candidate endpoints are tried in order; the first
+        one that returns a non-empty JSON payload is used and the rest are
+        skipped. Each collected record is tagged with
+        ``discovery_method = 'county_<fips>'``.
+
+        Args:
+            api_key: AirNow API key, sent as the ``API_KEY`` query parameter.
+
+        Returns:
+            Every record collected, in request order. Nothing is de-duplicated
+            here. A failed request or an unusable response is logged and
+            skipped, so the list may be empty.
+        """
+        data = []
+        # Fixed sample of 200 county FIPS codes: 10 from each of 20 states
+        county_fips = [
+            # California counties (major population centers)
+            "06037", "06073", "06059", "06065", "06075", "06001", "06013", "06017", "06029", "06071",
+            # Texas counties
+            "48201", "48113", "48439", "48453", "48157", "48029", "48215", "48085", "48167", "48141",
+            # Florida counties
+            "12086", "12011", "12057", "12103", "12095", "12099", "12031", "12105", "12117", "12127",
+            # New York counties
+            "36061", "36047", "36081", "36059", "36103", "36119", "36001", "36029", "36063", "36067",
+            # Illinois counties
+            "17031", "17043", "17089", "17097", "17111", "17163", "17019", "17037", "17093", "17161",
+            # Pennsylvania counties
+            "42101", "42003", "42017", "42129", "42045", "42049", "42091", "42095", "42133", "42125",
+            # Ohio counties
+            "39035", "39061", "39093", "39113", "39151", "39153", "39165", "39017", "39025", "39041",
+            # Georgia counties
+            "13121", "13135", "13089", "13097", "13113", "13117", "13151", "13067", "13073", "13077",
+            # North Carolina counties
+            "37119", "37063", "37081", "37129", "37179", "37183", "37035", "37071", "37025", "37159",
+            # Michigan counties
+            "26163", "26161", "26099", "26125", "26049", "26065", "26075", "26115", "26139", "26145",
+            # New Jersey counties
+            "34003", "34013", "34017", "34023", "34025", "34027", "34031", "34035", "34037", "34039",
+            # Virginia counties
+            "51059", "51087", "51095", "51107", "51153", "51177", "51179", "51013", "51041", "51161",
+            # Washington counties
+            "53033", "53053", "53061", "53063", "53067", "53011", "53015", "53021", "53035", "53057",
+            # Arizona counties
+            "04013", "04019", "04021", "04003", "04005", "04007", "04009", "04011", "04015", "04017",
+            # Massachusetts counties
+            "25017", "25021", "25023", "25025", "25027", "25005", "25009", "25013", "25015", "25003",
+            # Tennessee counties
+            "47037", "47157", "47093", "47065", "47149", "47165", "47113", "47125", "47147", "47179",
+            # Indiana counties
+            "18097", "18003", "18089", "18141", "18163", "18167", "18057", "18063", "18095", "18105",
+            # Missouri counties
+            "29095", "29189", "29099", "29183", "29037", "29047", "29071", "29077", "29113", "29219",
+            # Wisconsin counties
+            "55079", "55025", "55133", "55105", "55009", "55017", "55021", "55035", "55087", "55127",
+            # Maryland counties
+            "24003", "24005", "24013", "24025", "24027", "24031", "24033", "24035", "24043", "24510",
+        ]
+        # Built once; the candidate list is the same for every county.
+        # NOTE(review): confirm these paths against the AirNow API documentation.
+        # The last one takes no county code, so when it is the endpoint that
+        # answers, the payload is presumably state-wide rather than per-county.
+        endpoints = [
+            f"{self.base_url}/aq/observation/county/current/",
+            f"{self.base_url}/aq/data/county/",
+            f"{self.base_url}/aq/observation/state/current/"
+        ]
+        total_counties = len(county_fips)
+        for i, fips in enumerate(county_fips):
+            # The first two characters are sent as stateCode, the rest as countyCode.
+            # Distinct local names keep the county_fips list intact, so the
+            # progress message below reports the real number of counties.
+            state_code = fips[:2]
+            county_code = fips[2:]
+            for endpoint in endpoints:
+                try:
+                    if "county" in endpoint:
+                        params = {
+                            "format": "application/json",
+                            "stateCode": state_code,
+                            "countyCode": county_code,
+                            "API_KEY": api_key
+                        }
+                    else:  # state endpoint
+                        params = {
+                            "format": "application/json",
+                            "stateCode": state_code,
+                            "API_KEY": api_key
+                        }
+                    response = requests.get(endpoint, params=params, timeout=10)
+                    if response.status_code == 200:
+                        county_data = response.json()
+                        if county_data:
+                            for record in county_data:
+                                record['discovery_method'] = f'county_{fips}'
+                            data.extend(county_data)
+                            print(f"County {fips}: {len(county_data)} stations")
+                            break  # Found data, move to next county
+                    time.sleep(0.02)  # Brief pause between requests
+                except (requests.exceptions.RequestException, ValueError, TypeError) as exc:
+                    # Network failure, non-JSON body, or a payload that is not a
+                    # list of dicts. Only the exception type is logged because
+                    # the message can contain the request URL with the API key.
+                    print(f"County {fips}: request failed ({type(exc).__name__})")
+                    continue
+            if i % 50 == 0:
+                print(f"Processed {i+1}/{total_counties} counties, found {len(data)} total records")
+        return data
+    def discover_by_coordinate_grid(self, api_key: str) -> List[Dict]:
+        """Sweep a 2-degree lat/lon grid over three regions, querying each cell.
+
+        Each cell issues one latLong observation request at the cell's centre
+        with ``distance`` set to 100. Every returned record is tagged with
+        ``discovery_method = 'grid_<lat>_<lon>'``, where lat/lon are the cell's
+        lower-left corner in whole degrees.
+
+        Args:
+            api_key: AirNow API key, sent as the ``API_KEY`` query parameter.
+
+        Returns:
+            Every record collected across all cells. Neighbouring cells can
+            return the same record; nothing is de-duplicated here.
+        """
+        data = []
+        # Whole-degree bounds per region; the two lists are paired by position
+        lat_range = [(25, 50), (50, 72), (18, 25)]  # Continental, Alaska, Hawaii
+        lon_range = [(-125, -65), (-170, -130), (-162, -154)]  # Continental, Alaska, Hawaii
+        # 2-degree cells keep the request count down.
+        # NOTE(review): this is still roughly 600 requests per call - check it
+        # against the API's rate limit.
+        lat_step = 2.0
+        lon_step = 2.0
+        for region_idx, (lat_bounds, lon_bounds) in enumerate(zip(lat_range, lon_range)):
+            min_lat, max_lat = lat_bounds
+            min_lon, max_lon = lon_bounds
+            for lat in range(int(min_lat), int(max_lat), int(lat_step)):
+                for lon in range(int(min_lon), int(max_lon), int(lon_step)):
                     try:
+                        # Try coordinate-based endpoint
+                        url = f"{self.base_url}/aq/observation/latLong/current/"
                         params = {
                             "format": "application/json",
+                            "latitude": lat + lat_step / 2,  # Center of grid cell
+                            "longitude": lon + lon_step / 2,
+                            "distance": 100,  # Large radius to catch all nearby stations
                             "API_KEY": api_key
                         }
+                        response = requests.get(url, params=params, timeout=8)
                         if response.status_code == 200:
+                            grid_data = response.json()
+                            if grid_data:
+                                for record in grid_data:
+                                    record['discovery_method'] = f'grid_{lat}_{lon}'
+                                data.extend(grid_data)
+                        time.sleep(0.01)  # Brief pause between requests
+                    except (requests.exceptions.RequestException, ValueError, TypeError) as exc:
+                        # Only the exception type is logged because the message
+                        # can contain the request URL with the API key.
+                        print(f"Grid cell ({lat}, {lon}): request failed ({type(exc).__name__})")
                         continue
+            print(f"Region {region_idx + 1} grid complete: {len(data)} total records")
+        return data
+    def discover_bulk_endpoints(self, api_key: str) -> List[Dict]:
+        """Probe a list of candidate bulk-data URLs and keep any JSON list returned.
+
+        A URL contributes records only when it answers with HTTP 200 and a
+        non-empty JSON list. Each record is tagged with
+        ``discovery_method = 'bulk_endpoint'``.
+
+        Args:
+            api_key: AirNow API key, sent as the ``API_KEY`` query parameter.
+
+        Returns:
+            Every record collected; empty when no candidate URL yields a JSON list.
+        """
+        data = []
+        # Speculative candidates - NOTE(review): none of these paths is confirmed
+        # to exist; check them against the AirNow documentation.
+        bulk_endpoints = [
+            f"{self.base_url}/aq/data/",
+            f"{self.base_url}/aq/data/sites/",
+            f"{self.base_url}/aq/data/monitors/",
+            f"{self.base_url}/aq/data/stations/",
+            f"{self.base_url}/files/data/",
+            f"{self.base_url}/files/reportingarea.dat",
+            f"{self.base_url}/files/sites/",
+        ]
+        for endpoint in bulk_endpoints:
+            try:
+                params = {"format": "application/json", "API_KEY": api_key}
+                response = requests.get(endpoint, params=params, timeout=15)
+                if response.status_code == 200:
+                    bulk_data = response.json()
+                    if bulk_data and isinstance(bulk_data, list):
+                        for record in bulk_data:
+                            record['discovery_method'] = 'bulk_endpoint'
+                        data.extend(bulk_data)
+                        print(f"Bulk endpoint {endpoint}: {len(bulk_data)} records")
+            except (requests.exceptions.RequestException, ValueError, TypeError) as exc:
+                # Network failure, non-JSON body (e.g. a .dat file), or list items
+                # that are not dicts. Only the exception type is logged because
+                # the message can contain the request URL with the API key.
+                print(f"Bulk endpoint {endpoint}: request failed ({type(exc).__name__})")
+                continue
+        return data
+    def discover_by_states_and_parameters(self, api_key: str) -> List[Dict]:
+        """Query the state-level observation endpoint once per state code.
+
+        The request carries only the state code, so one call per state returns
+        everything this endpoint offers for it. No pollutant parameter is sent;
+        the method name is kept for existing callers. Each record is tagged
+        with ``discovery_method = 'state_<code>'``.
+
+        Args:
+            api_key: AirNow API key, sent as the ``API_KEY`` query parameter.
+
+        Returns:
+            Every record collected across all state codes; nothing is
+            de-duplicated here.
+        """
+        data = []
+        states = [
+            "01", "02", "04", "05", "06", "08", "09", "10", "11", "12", "13", "15", "16", "17", "18", "19",
+            "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
+            "36", "37", "38", "39", "40", "41", "42", "44", "45", "46", "47", "48", "49", "50", "51", "53",
+            "54", "55", "56", "72", "78"  # All states + territories
+        ]
+        # NOTE(review): confirm this path against the AirNow API documentation.
+        url = f"{self.base_url}/aq/observation/state/current/"
+        for state in states:
+            try:
+                params = {
+                    "format": "application/json",
+                    "stateCode": state,
+                    "API_KEY": api_key
+                }
+                response = requests.get(url, params=params, timeout=10)
+                if response.status_code == 200:
+                    state_data = response.json()
+                    if state_data:
+                        for record in state_data:
+                            record['discovery_method'] = f'state_{state}'
+                        data.extend(state_data)
+                time.sleep(0.02)  # Brief pause between requests
+            except (requests.exceptions.RequestException, ValueError, TypeError) as exc:
+                # Only the exception type is logged because the message can
+                # contain the request URL with the API key.
+                print(f"State {state}: request failed ({type(exc).__name__})")
+                continue
+        return data
+    def discover_metropolitan_areas(self, api_key: str) -> List[Dict]:
+        """Query the latLong observation endpoint around a fixed list of city centres.
+
+        One request is made per coordinate pair with ``distance`` set to 50.
+        Each returned record is tagged with
+        ``discovery_method = 'metro_<lat>_<lon>'``.
+
+        Args:
+            api_key: AirNow API key, sent as the ``API_KEY`` query parameter.
+
+        Returns:
+            Every record collected. Nearby metros can return the same record;
+            nothing is de-duplicated here.
+        """
+        data = []
+        # (latitude, longitude) of each metro centre
+        metro_areas = [
+            (40.7128, -74.0060),   # NYC
+            (34.0522, -118.2437),  # Los Angeles
+            (41.8781, -87.6298),   # Chicago
+            (29.7604, -95.3698),   # Houston
+            (33.4484, -112.0740),  # Phoenix
+            (39.9526, -75.1652),   # Philadelphia
+            (29.4241, -98.4936),   # San Antonio
+            (32.7767, -96.7970),   # Dallas
+            (37.3382, -121.8863),  # San Jose
+            (30.2672, -97.7431),   # Austin
+            # TODO: extend this list with further metros
+            (32.0835, -81.0998),   # Savannah
+            (35.2271, -80.8431),   # Charlotte
+            (36.1627, -86.7816),   # Nashville
+            (39.7391, -104.9847),  # Denver
+            (47.6062, -122.3321),  # Seattle
+            (45.5152, -122.6784),  # Portland
+            (39.2904, -76.6122),   # Baltimore
+            (38.9072, -77.0369),   # Washington DC
+            (42.3601, -71.0589),   # Boston
+            (25.7617, -80.1918),   # Miami
+            (26.1224, -80.1373),   # Fort Lauderdale
+            (28.5383, -81.3792),   # Orlando
+            (27.9506, -82.4572),   # Tampa
+            (30.3322, -81.6557),   # Jacksonville
         ]
+        url = f"{self.base_url}/aq/observation/latLong/current/"
+        for lat, lon in metro_areas:
+            try:
+                params = {
+                    "format": "application/json",
+                    "latitude": lat,
+                    "longitude": lon,
+                    "distance": 50,  # 50-mile radius around metro center
+                    "API_KEY": api_key
+                }
+                response = requests.get(url, params=params, timeout=10)
+                if response.status_code == 200:
+                    metro_data = response.json()
+                    if metro_data:
+                        for record in metro_data:
+                            record['discovery_method'] = f'metro_{lat}_{lon}'
+                        data.extend(metro_data)
+                time.sleep(0.02)  # Brief pause between requests
+            except (requests.exceptions.RequestException, ValueError, TypeError) as exc:
+                # Only the exception type is logged because the message can
+                # contain the request URL with the API key.
+                print(f"Metro ({lat}, {lon}): request failed ({type(exc).__name__})")
+                continue
+        return data
+    def comprehensive_deduplication(self, data: List[Dict]) -> List[Dict]:
+        """Comprehensive deduplication preserving maximum unique stations"""
         seen_stations = set()
         unique_data = []
         for item in data:
+            # Create ultra-specific key to preserve different monitor types
             station_key = (
+                round(item.get('Latitude', 0), 8),      # Very high precision
+                round(item.get('Longitude', 0), 8),
                 item.get('ParameterName', ''),
+                item.get('SiteName', ''),
+                item.get('AgencyName', ''),
+                item.get('MonitorType', ''),
+                item.get('ReportingArea', ''),
+                item.get('StateCode', ''),
+                item.get('CountyCode', '')
             )
             if station_key not in seen_stations: