import asyncio
import datetime
import json
import os
from urllib.parse import unquote

import mysql.connector
import numpy as np
import pandas as pd
import requests
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from joblib import dump, load
from mysql.connector import Error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/trigger_the_model_trainer")
async def train_the_model():
    # Load the dataset
    # file_path = 'model/trainer_data.csv'  # Update to the correct file path, e.g. 'model/trainer_data_new.csv'
    # data = pd.read_csv(file_path)
    csv_files = [
        'model/trainer_data.csv',
        'model/trainer_data2.csv',
        'model/trainer_data3.csv',
        'model/trainer_data4.csv',
    ]
    data_frames = [pd.read_csv(file) for file in csv_files]

    # Concatenate all DataFrames into a single DataFrame
    data = pd.concat(data_frames, ignore_index=True)
    # data = data.iloc[0:50000]

    # Analyze the class distribution
    class_distribution = data['status_name'].value_counts()
    print("Class Distribution before balancing:\n", class_distribution)

    # Size of the largest class; every other class is oversampled up to this size
    max_class_size = class_distribution.max()

    # Oversampling
    oversampled_data = pd.DataFrame()
    for class_name, group in data.groupby('status_name'):
        oversampled_group = resample(
            group,
            replace=True,              # sample with replacement
            n_samples=max_class_size,  # to match the majority class
            random_state=123,          # for reproducibility
        )
        oversampled_data = pd.concat([oversampled_data, oversampled_group], axis=0)

    # Verify the new class distribution
    print("Class Distribution after oversampling:\n", oversampled_data['status_name'].value_counts())

    # Save the balanced dataset if needed
    # oversampled_data.to_csv('model/trainer_data_balanced.csv', index=False)

    # Leftover debug reads; these files may not exist on a fresh deployment,
    # so they are kept commented out:
    # data = pd.read_csv("model/trainer_data_new.csv")
    # print(data["customer_name"].count())
    # data = pd.read_csv("model/trainer_data_balanced.csv")
    # print(data["customer_name"].count())
    data = oversampled_data
    print(data["customer_name"].count())

    # Select columns
    selected_columns = [
        'customer_name', 'customer_address', 'customer_phone_no', 'weight', 'cod',
        'pickup_address', 'client_number', 'destination_city', 'status_name',
    ]

    # Handle missing values
    # data_filled = data[selected_columns].fillna('Missing')
    data_filled = data[selected_columns].dropna()

    # Encode categorical variables
    encoders = {col: LabelEncoder() for col in selected_columns if data_filled[col].dtype == 'object'}
    for col, encoder in encoders.items():
        data_filled[col] = encoder.fit_transform(data_filled[col])

    # Split the dataset
    X = data_filled.drop('status_name', axis=1)
    y = data_filled['status_name']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Parameters to use for the model
    """params = {
        'colsample_bytree': 0.3,
        'learning_rate': 0.6,
        'max_depth': 6,
        'n_estimators': 100,
        'subsample': 0.9,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }"""
    params = {
        'colsample_bytree': 0.9,
        'learning_rate': 0.1,
        'max_depth': 30,
        'n_estimators': 600,
        'subsample': 0.9,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
    }

    # Initialize the classifier with the specified parameters
    xgb = XGBClassifier(**params)

    # Train the model
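    # Optional sketch (assumption, not part of the original training flow): an
    # evaluation set can be passed to xgb.fit to monitor hold-out logloss while
    # training, e.g.
    #   xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)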
    xgb.fit(X_train, y_train)

    # Predict on the test set
    y_pred = xgb.predict(X_test)
    y_pred_proba = xgb.predict_proba(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Save the model
    model_filename = 'model/transexpress_xgb_model.joblib'
    dump(xgb, model_filename)

    # Save the encoders
    encoders_filename = 'model/transexpress_encoders.joblib'
    dump(encoders, encoders_filename)

    return accuracy, classification_rep, "Model trained with new data"


@app.get("/trigger_the_data_fecher")
async def get_data(page: str, paginate: str):
    print("data fetcher running.....")

    # Initialize an empty DataFrame to store the combined data
    combined_df = pd.DataFrame()

    # Build the request for the given page
    url = (
        "https://report.transexpress.lk/api/orders/delivery-success-rate/return-to-client-orders"
        "?page=" + page + "&per_page=" + paginate
    )
    payload = {}
    headers = {
        'Cookie': 'development_trans_express_session=NaFDGzh5WQCFwiortxA6WEFuBjsAG9GHIQrbKZ8B'
    }
    response = requests.request("GET", url, headers=headers, data=payload)

    # Parse the JSON response
    json_response = response.json()

    # Extract 'data' for conversion
    data = json_response["return_to_client_orders"]['data']
    data_count = len(data)
    df = pd.json_normalize(data)
    df['status_name'] = df['status_name'].replace('Partially Delivered', 'Delivered')
    df['status_name'] = df['status_name'].replace('Received by Client', 'Returned to Client')
    print("data collected from page : " + page)
    # return "done"

    try:
        file_path = 'model/trainer_data5.csv'  # Replace with your file path
        source_csv = pd.read_csv(file_path)
        new_data = df
        combined_df_final = pd.concat([source_csv, new_data], ignore_index=True)
        combined_df_final.to_csv("model/trainer_data5.csv", index=False)
        print("data added")
    except FileNotFoundError:
        df.to_csv("model/trainer_data5.csv", index=False)
        print("data created")

    print({"page_number": page, "data_count": data_count})
    return {"page_number": page, "data_count": data_count}


@app.get("/get_module_versions")
async def get_versions():
    try:
        from pip._internal.operations import freeze
    except ImportError:  # pip < 10.0
        from pip.operations import freeze

    # Materialise the generator so it can be both printed and returned
    pkgs = list(freeze.freeze())
    for pkg in pkgs:
        print(pkg)

    return pkgs


@app.get("/get_latest_model_updated_time")
async def model_updated_time():
    try:
        file_size = os.path.getsize("model/transexpress_xgb_model.joblib")
        m_time_encoder = os.path.getmtime('model/transexpress_encoders.joblib')
        m_time_model = os.path.getmtime('model/transexpress_xgb_model.joblib')
        return {
            "base model created time": datetime.datetime.fromtimestamp(m_time_encoder),
            "last model updated time": datetime.datetime.fromtimestamp(m_time_model),
            "model file size in bytes": file_size,
        }
    except OSError:
        return {"message": "no model found, first train the model using the data fetcher"}


# Database connection parameters
DB_HOST = 'trans-prod-clone-staging.mysql.database.azure.com'
DB_PORT = 3306
DB_DATABASE = 'defaultdb'
DB_USERNAME = 'wwwdata'
DB_PASSWORD = 'fcLa8F3sxgNYQ$K@%'


# Connect to the database; this function is called for each request
def fetch_customer_data(phone_number):
    # Local connection
    connection = mysql.connector.connect(
        host=DB_HOST,
        port=DB_PORT,
        database=DB_DATABASE,
        user=DB_USERNAME,
        password=DB_PASSWORD
    )

    # try:
    if connection.is_connected():
        print("Connected to the database")

        # SQL query: join orders to their primary status names for the given phone number
        query = """
            SELECT
                orders.customer_name AS customer_name,
                orders.address AS customer_address,
                orders.phone_no AS customer_phone_no,
                primary_statuses.name AS status_name
            FROM orders
            INNER JOIN statuses ON orders.status_id = statuses.id
            INNER JOIN primary_statuses ON statuses.name = primary_statuses.key
            WHERE orders.phone_no LIKE %s
        """
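        # The single %s placeholder is bound below with f"%{phone_number}%",
        # so the lookup is a substring match against the stored phone number.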
        # Execute the query
        cursor = connection.cursor(dictionary=True)
        cursor.execute(query, (f"%{phone_number}%",))

        # Fetch results
        results = cursor.fetchall()
        # print("Results:", results)

        # Close the connection
        # if connection.is_connected():
        cursor.close()
        connection.close()
        print("Database connection closed")

        return results
    # except Error as e:
    #     print(f"Error: {e}")
    #     # Close the connection
    #     # if connection.is_connected():
    #     cursor.close()
    #     connection.close()
    #     print("Database connection closed")


# Endpoint for making predictions
@app.post("/predict")
async def predict(
    date: str,
    customer_name: str,
    customer_address: str,
    customer_phone: str,
    weight: float,
    cod: int,
    pickup_address: str,
    client_number: str,
    destination_city: str,
):
    try:
        # Load the trained model and encoders
        xgb_model = load('model/transexpress_xgb_model.joblib')
        encoders = load('model/transexpress_encoders.joblib')
    except FileNotFoundError:
        return {"message": "no model found, first train the model using the data fetcher"}

    # Handle unseen labels during encoding
    def safe_transform(encoder, column):
        classes = encoder.classes_
        return [encoder.transform([x])[0] if x in classes else -1 for x in column]

    # Convert the input data to a DataFrame
    input_data = {
        'customer_name': customer_name,
        'customer_address': customer_address,
        'customer_phone_no': customer_phone,
        'weight': float(weight),
        'cod': int(cod),
        'pickup_address': pickup_address,
        'client_number': client_number,
        'destination_city': destination_city
    }
    input_df = pd.DataFrame([input_data])

    # Encode categorical variables using the same encoders used during training
    for col in input_df.columns:
        if col in encoders:
            input_df[col] = safe_transform(encoders[col], input_df[col])

    # Predict and obtain probabilities
    pred = xgb_model.predict(input_df)
    pred_proba = xgb_model.predict_proba(input_df)

    def extract_phone_numbers(customer_phone):
        # Decode URL-encoded phone numbers
        decoded_phone = unquote(customer_phone)
        # Split into a list of phone numbers
        phone_numbers = [phone.strip() for phone in decoded_phone.split('/')]
        # Handle the case where there is a single phone number
        if len(phone_numbers) == 1 and phone_numbers[0]:
            return phone_numbers
        elif len(phone_numbers) == 0:
            return []
        return phone_numbers

    def calculate_delivery_factor(phone_number):
        # Look up the customer's order history by phone number
        # customer_phone_no = '0773224384'
        data = fetch_customer_data(phone_number)
        # print(url, data)

        # Filter only the relevant status names
        valid_statuses = ['Failed to Deliver', 'Delivered', 'Returned to Client']
        relevant_orders = [order for order in data if order['status_name'] in valid_statuses]

        if not relevant_orders:
            base_probability = 0.50
        else:
            delivered_count = sum(1 for order in relevant_orders if order['status_name'] == 'Delivered')
            total_orders_count = len(relevant_orders)
            base_probability = delivered_count / total_orders_count
            base_probability = max(0.05, min(base_probability, 0.95))

        # Add a narrow random component
        random_component = np.random.uniform(-0.05, 0.05)
        adjusted_probability = base_probability + random_component
        return adjusted_probability

    try:
        print(customer_phone)
        phone_numbers = extract_phone_numbers(customer_phone)
        print(phone_numbers, "api calling ......")
        probability = calculate_delivery_factor(phone_numbers[0])
        # Keep the probability numeric; formatting it into a string here would
        # break the numeric adjustments below
        probability = round(probability * 100, 2)
        print(f"new model probability: {probability:.2f}")
        predicted_status = "delivered"  # Output
"delivered" # Output except Exception as e: print(f"Error: {e}") predicted_status = "Unknown" if pred[0] == -1 else encoders['status_name'].inverse_transform([pred])[0] probability = pred_proba[0][pred[0]] * 100 if pred[0] != -1 else "Unknown" print(str(predicted_status),probability) if probability>98: probability = probability-1 if predicted_status == "Returned to Client": probability = 100 - probability return {"Probability": round(probability,2),"predicted_status":predicted_status}