import json
import math
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

from coding.llh.visualization.draw_boxplot import draw_boxplot
from coding.llh.visualization.draw_heat_map import draw_heat_map
from coding.llh.visualization.draw_histogram import draw_histogram
from coding.llh.visualization.draw_histogram_line_subgraph import draw_histogram_line_subgraph
from coding.llh.visualization.draw_line_graph import draw_line_graph


# 0202:
def data_transformation_extra(df: pd.DataFrame, str2int_mappings: dict) -> pd.DataFrame:
    # Delete "match_id" column
    # df.drop("match_id", axis=1, inplace=True)
    # Keep only the last four characters of "match_id" as a numeric identifier
    df["match_id"] = df["match_id"].apply(lambda x: x[-4:])

    # Dissolve the two-mode score data into two parts: normal games and tiebreaks.
    # "AD" is first replaced with "50" so every score can be compared as an integer.
    value_to_replace_dict = {
        "AD": "50"
    }
    value_to_replace = "AD"

    df["p1_score"] = df["p1_score"].replace(value_to_replace, value_to_replace_dict[value_to_replace])
    df["p2_score"] = df["p2_score"].replace(value_to_replace, value_to_replace_dict[value_to_replace])

    str2int_mappings_to_dissolve = {
        "p1_score": {"0": 0},
        "p2_score": {"0": 0}
    }

    # Mark the first point of each game: 1 for a normal game ("0" -> "15"),
    # 2 for a tiebreak ("0" -> "1")
    df["p1_score_mark"] = 0
    df["p2_score_mark"] = 0

    for key in str2int_mappings_to_dissolve.keys():
        for i in range(1, len(df)):
            if df.loc[i, key] == "15" and df.loc[i - 1, key] == "0":
                df.loc[i, key + "_mark"] = 1
            elif df.loc[i, key] == "1" and df.loc[i - 1, key] == "0":
                df.loc[i, key + "_mark"] = 2

    df["p1_score_normal"] = 0
    df["p1_score_tiebreak"] = 0
    df["p2_score_normal"] = 0
    df["p2_score_tiebreak"] = 0

    normal_counter = 0
    tiebreak_counter = 0

    # Walk each score column, incrementing the relevant counter while the score rises
    # and decrementing it when the score falls; counters reset at "0" (a new game).
    for key in str2int_mappings_to_dissolve.keys():
        for i in range(0, len(df)):
            if df.loc[i, key] == "0":
                normal_counter = 0
                tiebreak_counter = 0
                continue

            if df.loc[i, key + "_mark"] == 1 or normal_counter > 0:
                if int(df.loc[i, key]) > int(df.loc[i - 1, key]):
                    normal_counter += 1
                    df.loc[i, key + "_normal"] = normal_counter

                    # Record the mapping under "AD" rather than the substituted "50"
                    if df.loc[i, key] == value_to_replace_dict[value_to_replace]:
                        str2int_mappings_to_dissolve[key][value_to_replace] = normal_counter
                    else:
                        str2int_mappings_to_dissolve[key][df.loc[i, key]] = normal_counter
                elif int(df.loc[i, key]) < int(df.loc[i - 1, key]):
                    normal_counter -= 1
                    df.loc[i, key + "_normal"] = normal_counter
                else:
                    df.loc[i, key + "_normal"] = normal_counter
            elif df.loc[i, key + "_mark"] == 2 or tiebreak_counter > 0:
                if int(df.loc[i, key]) > int(df.loc[i - 1, key]):
                    tiebreak_counter += 1
                    df.loc[i, key + "_tiebreak"] = tiebreak_counter

                    if df.loc[i, key] == value_to_replace_dict[value_to_replace]:
                        str2int_mappings_to_dissolve[key][value_to_replace] = tiebreak_counter
                    else:
                        str2int_mappings_to_dissolve[key][df.loc[i, key]] = tiebreak_counter
                elif int(df.loc[i, key]) < int(df.loc[i - 1, key]):
                    tiebreak_counter -= 1
                    df.loc[i, key + "_tiebreak"] = tiebreak_counter
                else:
                    df.loc[i, key + "_tiebreak"] = tiebreak_counter

    str2int_mappings.update(str2int_mappings_to_dissolve)

    df.drop(["p1_score_mark", "p2_score_mark", "p1_score", "p2_score"], axis=1, inplace=True)

    # Transform the "elapsed_time" column from "hh:mm:ss" strings to seconds
    def transform_time_col(time: str) -> int:
        h, m, s = time.strip().split(":")
        return int(h) * 3600 + int(m) * 60 + int(s)

    df["elapsed_time"] = df["elapsed_time"].apply(transform_time_col)

    # Turn the "game_victor" and "set_victor" columns into per-player
    # cumulative win counts within each match-up
    df["p1_game_victor"] = df.apply(lambda x: 1 if x["game_victor"] == 1 else 0, axis=1)
    df["p2_game_victor"] = df.apply(lambda x: 1 if x["game_victor"] == 2 else 0, axis=1)
    df["p1_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 1 else 0, axis=1)
    df["p2_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 2 else 0, axis=1)

    df["p1_game_victor"] = df.groupby(["player1", "player2"])["p1_game_victor"].cumsum()
    df["p2_game_victor"] = df.groupby(["player1", "player2"])["p2_game_victor"].cumsum()
    df["p1_set_victor"] = df.groupby(["player1", "player2"])["p1_set_victor"].cumsum()
    df["p2_set_victor"] = df.groupby(["player1", "player2"])["p2_set_victor"].cumsum()

    # Forced conversion of data types
    for col in df.columns.values:
        df[col] = df[col].astype("float")

    # Save the mappings to a json format file
    with open("./data/mappings.json", "w", encoding="utf-8") as f:
        json.dump(str2int_mappings, f, indent=4, ensure_ascii=False)

    return df
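
# A minimal standalone sketch (not part of the pipeline) of the two transformations
# above on toy data: "AD" scores are replaced with "50" so every score compares as an
# integer, and "elapsed_time" strings collapse to seconds. The values below are
# hypothetical and assumed only for illustration.
def _demo_score_and_time_transform():
    demo = pd.DataFrame({
        "p1_score": ["0", "15", "30", "40", "AD"],
        "elapsed_time": ["0:00:05", "0:01:10", "1:02:03", "1:02:30", "1:03:00"],
    })
    # Same replacement used in data_transformation_extra: "AD" becomes "50"
    demo["p1_score"] = demo["p1_score"].replace("AD", "50")
    # Same hh:mm:ss -> seconds conversion as transform_time_col
    demo["elapsed_time"] = demo["elapsed_time"].apply(
        lambda t: sum(int(part) * unit for part, unit in zip(t.strip().split(":"), (3600, 60, 1)))
    )
    print(demo)
    # p1_score is now all-numeric strings: ["0", "15", "30", "40", "50"]
    # elapsed_time is now seconds:         [5, 70, 3723, 3750, 3780]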
x["set_victor"] == 1 else 0, axis=1) df["p2_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 2 else 0, axis=1) df["p1_game_victor"] = df.groupby(["player1", "player2"])["p1_game_victor"].cumsum() df["p2_game_victor"] = df.groupby(["player1", "player2"])["p2_game_victor"].cumsum() df["p1_set_victor"] = df.groupby(["player1", "player2"])["p1_set_victor"].cumsum() df["p2_set_victor"] = df.groupby(["player1", "player2"])["p2_set_victor"].cumsum() # Forced conversion of data types for col in df.columns.values: df[col] = df[col].astype("float") # Save the mappings to a json format file with open("./data/mappings.json", "w", encoding="utf-8") as f: json.dump(str2int_mappings, f, indent=4, ensure_ascii=False) return df def data_transformation(df: pd.DataFrame) -> (pd.DataFrame, dict): """ 0. 1. Define mappings 2. Create mappings 3. Modify the original data according to the mappings 4. Get type exception 5. Forced conversion of data types """ info = {} # Define mappings str2int_mappings = { "player1": {}, "player2": {}, "winner_shot_type": {}, "serve_width": {}, "serve_depth": {}, "return_depth": {} } # Create mappings for col in str2int_mappings.copy(): keys = np.array(df[col].drop_duplicates()) values = [x for x in range(len(keys))] str2int_mappings[col] = dict(zip(keys, values)) # Modify the original data according to the mappings for col, mapping in str2int_mappings.items(): series = df[col] for k, v in mapping.items(): series.replace(k, v, inplace=True) df[col] = series df.replace('Not A Number', 0, inplace=True) # Get type exception # abnormal_type_values = [] # # for col in df.columns.values: # if col not in str2int_mappings.keys(): # for row in df[col]: # if not (0 <= row <= sys.maxsize): # abnormal_type_values.append(row) # # info["Number of abnormal type value"] = sorted(abnormal_type_values) # # Forced conversion of data types # for col in df.columns.values: # df[col] = df[col].astype("float") # # # Save the mappings to a json format file # with open("./mappings.json", "w", encoding="utf-8") as f: # json.dump(str2int_mappings, f, indent=4, ensure_ascii=False) # 0202: df = data_transformation_extra(df, str2int_mappings) return df, info # Get descriptive indicators and filtered data based on boxplpot def get_descriptive_indicators_related(df): info = {} descriptive_indicators_df = pd.DataFrame( index=list(df.columns.values), columns=[ "Min", "Max", "Avg", "Standard Deviation", "Standard Error", "Upper Quartile", "Median", "Lower Quartile", "Interquartile Distance", "Kurtosis", "Skewness", "Coefficient of Variation" ] ) for col in df.columns.values: descriptive_indicators_df["Min"][col] = df[col].min() descriptive_indicators_df["Max"][col] = df[col].max() descriptive_indicators_df["Avg"][col] = df[col].mean() descriptive_indicators_df["Standard Deviation"][col] = df[col].std() descriptive_indicators_df["Standard Error"][col] = descriptive_indicators_df["Standard Deviation"][col] / \ math.sqrt(len(df[col])) descriptive_indicators_df["Upper Quartile"][col] = df[col].quantile(0.75) descriptive_indicators_df["Median"][col] = df[col].quantile(0.5) descriptive_indicators_df["Lower Quartile"][col] = df[col].quantile(0.25) descriptive_indicators_df["Interquartile Distance"][col] = descriptive_indicators_df["Lower Quartile"][col] - \ descriptive_indicators_df["Upper Quartile"][col] descriptive_indicators_df["Kurtosis"][col] = df[col].kurt() descriptive_indicators_df["Skewness"][col] = df[col].skew() descriptive_indicators_df["Coefficient of Variation"][col] = 
descriptive_indicators_df["Standard Deviation"][ col] \ / descriptive_indicators_df["Avg"][col] # draw_heat_map(descriptive_indicators_df.to_numpy(), "descriptive indicators", True) # # draw_boxplot(df, "descriptive indicators boxplot") len_0 = len(df) # tmp_df = \ # df[(df >= (descriptive_indicators_df["Lower Quartile"] - 1.5 * (descriptive_indicators_df["Upper Quartile"] - # descriptive_indicators_df["Lower Quartile"]))) # & (df <= (descriptive_indicators_df["Upper Quartile"] + 1.5 * (descriptive_indicators_df["Upper Quartile"] - # descriptive_indicators_df["Lower Quartile"])))][[ # "ProductChoice", "MembershipPoints", "ModeOfPayment", "ResidentCity", "PurchaseTenure", "IncomeClass", # "CustomerPropensity", "CustomerAge", "LastPurchaseDuration" # ]] # tmp_df.dropna(inplace=True) # df = pd.concat([tmp_df, df[["ProductChoice", "Channel", "MartialStatus"]]], axis=1, join="inner") # df = pd.concat([df.iloc[:, :9], df.iloc[:, 10:]], axis=1) # info["Number of offsetting value"] = len_0 - len(df) # # info["Total size of filtered data after descriptive analysis"] = len(df) return df, info # Create images of the distribution of the number of each variable def variable_distribution(df): counts_mappings = {} print("counts analysis") for col in tqdm(df.columns.values, desc='columns:'): counts_mapping = {} for x in tqdm(df[col], desc='cells'): if x in counts_mapping.keys(): counts_mapping[x] += 1 else: counts_mapping[x] = 1 counts_mappings[col] = counts_mapping total_data_for_plot = [] print("plotting") for col, mapping in tqdm(counts_mappings.items(), desc='columns'): if col in ["set_no", 'game_no']: sorting = sorted(mapping.items(), reverse=True, key=lambda m: m[0]) data = [x[1] for x in sorting] labels = [x[0] for x in sorting] total_data_for_plot.append(["line_graph", labels, data, col]) draw_line_graph(labels, data, col) else: sorting = sorted(mapping.items(), reverse=True, key=lambda m: m[1]) data = [x[1] for x in sorting] labels = [x[0] for x in sorting] will_rotate = True if col in ["player1","player2", "match_id"] else False will_show_text = False if col in ["ResidentCity"] else True total_data_for_plot.append(["histogram", data, labels, will_rotate, will_show_text, col]) draw_histogram(data, labels, will_rotate, will_show_text, col) # draw_histogram_line_subgraph(total_data_for_plot)