Spaces:
Sleeping
Sleeping
File size: 11,460 Bytes
6d776bd 1a02e7a 6d776bd 1a02e7a 6d776bd 1a02e7a 6d776bd 1a02e7a 6d776bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 |
import io
import re
import sqlite3

import pandas as pd

from settings import *
# from utils_fcts import print_df_shape  # keep commented out: this module is imported in utils_fcts; circular-import error
def clean_cols(myl):
    """Normalize a list of column names.

    Strips each name, replaces special characters (brackets, signs,
    parentheses, degree/percent, spaces) with underscores, collapses
    runs of underscores, and trims leading/trailing underscores.
    """
    cleaned = []
    for name in myl:
        name = re.sub(r'[-\[\]\+ \(\)°%]', '_', name.strip())
        name = re.sub(r'_+', '_', name)
        cleaned.append(name.strip("_"))
    return cleaned
def getheadercols(r1, r2, r3):
    """Build cleaned column names from the three semicolon-separated
    header rows of the export file.

    The first row is forward-filled (an empty cell inherits the value
    to its left), cleaned, then the three rows are concatenated per
    column with underscores. The first column is forced to the
    timestamp column name from settings.
    """
    # NOTE: the headers have one field fewer than the value rows.
    top = r1.split(';')
    mid = r2.split(';')
    bot = r3.split(';')
    assert len(top) == len(mid)
    assert len(top) == len(bot)
    # forward-fill empty cells of the first header row
    for i in range(1, len(top)):
        if not top[i]:
            top[i] = top[i - 1]
    # drop special characters and units
    top = clean_cols(top)
    # concatenate the three header rows to form the new column names
    merged = ['_'.join(parts) for parts in zip(top, mid, bot)]
    merged = [re.sub(r'_+', '_', name) for name in merged]
    merged[0] = db_timecol  # first column is always the timestamp
    return clean_cols(merged)
def file2tables(file_path):
    """Parse one semicolon-separated export into three DataFrames.

    Expected file layout: a 3-line header, then up to 60*24 minute-
    resolution 'time' rows, then a block of daily rows whose first field
    starts with 'P<digit>' and a block starting with 'I<digit>'.
    Row counts (nRowsDayP, nRowsDayI), the header offset (nHeaderSkip)
    and the expected column lists come from settings — assumed; confirm.

    :param file_path: path of the file to read (latin1 encoding) or an
        io.StringIO already holding the file content.
    :return: dict with keys "time_data", "dayP_data", "dayI_data"
        (DataFrames) plus "error" / "success" accumulated log strings.
    :raises ValueError: if file_path is neither a str nor io.StringIO.
    """
    error_msg = ""
    ok_msg = ""
    # Input type detection: file path or in-memory file object
    if isinstance(file_path, str):  # it is a file path
        is_file_obj = False
        f = open(file_path, encoding='latin1')
        filepathlab = file_path
    elif isinstance(file_path, io.StringIO):  # it is a file-like object
        is_file_obj = True
        f = file_path  # use the file object directly
        filepathlab = "file content"
    else:
        raise ValueError("file_input must be either a file path or an io.StringIO object.")
    print("> START processing " + filepathlab)
    # a 'time' row starts with "dd.mm.yyyy hh:mm"
    time_pattern = re.compile(r'^\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}$')
    dayP_pattern = re.compile(r'^P\d.+')
    dayI_pattern = re.compile(r'^I\d.+')
    #with open(file_path, encoding='latin1') as f:
    try:
        print(">>> start reading content")
        # Read the 3 header lines
        header_lines = [f.readline().strip().rstrip(";") for _ in range(3)]
        new_columns = getheadercols(header_lines[0], header_lines[1], header_lines[2])
        time_lines_init = [f.readline().strip().rstrip(";").split(";") for _ in
                           range(60 * 24)]
        # day of the first timestamp; used to filter rows of that day only
        curr_day = time_lines_init[0][0].split(' ')[0]
        # no need for range(3, 3 + 60 * 24): the file pointer already advanced
        time_lines = [x for x in time_lines_init if len(x) == len(new_columns) and
                      time_pattern.match(x[0]) and
                      re.compile(curr_day).match(x[0])]
        # index of the last valid 'time' row, used to locate the dayP block
        maxTime_idx = max(i for i, x in enumerate(time_lines_init) if len(x) ==
                          len(new_columns) and time_pattern.match(x[0]))
        dayPstart_idx = maxTime_idx + nHeaderSkip + 1
        # reposition the file pointer at the start of the dayP block
        f.seek(0)
        for _ in range(dayPstart_idx):
            f.readline()
        dayP_lines_init = [f.readline().strip().rstrip(";").split(";") for _ in
                           range(nRowsDayP)]
        # no need for range(dayPstart_idx, (dayPstart_idx + nRowsDayP))
        dayP_lines = [x for x in dayP_lines_init if dayP_pattern.match(x[0])]
        maxDayP_idx = max(i for i, x in enumerate(dayP_lines_init) if
                          dayP_pattern.match(x[0]))
        dayIstart_idx = maxDayP_idx + dayPstart_idx + 1
        # reposition the file pointer at the start of the dayI block
        f.seek(0)
        for _ in range(dayIstart_idx):
            f.readline()
        dayI_lines_init = [f.readline().strip().rstrip(";").split(";") for _ in
                           range(nRowsDayI)]
        # no need for range(dayIstart_idx, (dayIstart_idx + nRowsDayI))
        dayI_lines = [x for x in dayI_lines_init if dayI_pattern.match(x[0])]
    finally:
        if not is_file_obj:
            f.close()  # only close the file if we opened it ourselves
    ###############################################################
    ##################### EXTRACT THE TIME DATA
    ###############################################################
    time_data = pd.DataFrame(time_lines)#, columns=new_columns)
    # convert the first column to a normalized datetime string
    time_data.loc[:, 0] = pd.to_datetime(time_data.iloc[:, 0],
                                         format='%d.%m.%Y %H:%M').dt.strftime('%Y-%m-%d %H:%M:%S')
    # Convert the other columns to numeric (float); bad values become NaN
    for col in range(1, time_data.shape[1]):
        time_data.iloc[:, col] = pd.to_numeric(time_data.iloc[:, col], errors='coerce')
    #time_data.to_pickle('time_data_v3.pkl')
    ntime, ncols = time_data.shape
    time_data.columns = new_columns
    # a complete day has 60 * 24 minute rows
    if ntime < 60 * 24:
        error_msg += filepathlab + " - WARNING : missing 'time' data (available : " + str(ntime) + "/" + str(60 * 24)
    elif ntime == 60 * 24:
        ok_msg += filepathlab + " - SUCCESS reading 'time' data "
    print(','.join(list(set(time_real_cols + time_txt_cols)-set(time_data.columns))))
    assert time_data.columns.isin(time_real_cols + time_txt_cols).all()
    time_missingcols = list(set(time_real_cols + time_txt_cols) -
                            set(time_data.columns))
    if len(time_missingcols) > 0:
        error_msg += "\n"+ filepathlab + " - WARNING : missing 'time' data columns (" + ','.join(time_missingcols) + ")\n"
    else:
        ok_msg += "\n"+ filepathlab + " - SUCCESS found all 'time' data columns\n"
    date_day = pd.to_datetime(curr_day, format='%d.%m.%Y').strftime('%Y-%m-%d')
    ###############################################################
    ##################### EXTRACT THE DAY P DATA
    ###############################################################
    # melt the wide dayP lines to long form, then transpose so the result
    # is a single row with one column per (label, position) pair
    dayP_dataL = pd.DataFrame(dayP_lines)
    dayP_datam = dayP_dataL.melt(id_vars=[0],
                                 value_vars=list(dayP_dataL.columns),
                                 var_name='variable',
                                 value_name='value')
    dayP_datam.iloc[:, 0] = dayP_datam.iloc[:, 0] + "_" + dayP_datam.iloc[:, 1].astype(str)
    dayP_datam.drop(columns=['variable'], inplace=True)
    dayP_data = dayP_datam.T
    dayP_data.columns = dayP_data.iloc[0]
    dayP_data = dayP_data[1:]
    # prepend the date column (name taken from settings)
    dayP_data.insert(0, day_txt_cols[0], date_day)
    ndayP, ndayPcols = dayP_data.shape
    assert ndayP <= 1
    if ndayP == 0:
        error_msg += "\n"+ filepathlab + " - WARNING : no 'dayP' data found"
    else :
        ok_msg += filepathlab + " - SUCCESS reading 'dayP' data "
    assert ndayPcols <= len(dayP_real_cols) + len(day_txt_cols)
    dayP_missingcols = list(set(dayP_real_cols + day_txt_cols) - set(dayP_data.columns))
    if len(dayP_missingcols) > 0:
        error_msg += "\n"+ filepathlab + " - WARNING : missing 'dayP' data columns (" + \
                     ','.join(dayP_missingcols) + ")\n"
    else:
        ok_msg += "\n"+ filepathlab + " - SUCCESS found all 'dayP' data columns\n"
    ###############################################################
    ##################### EXTRACT THE DAY I DATA
    ###############################################################
    # same melt/transpose reshaping as for dayP
    dayI_dataL = pd.DataFrame(dayI_lines)
    dayI_datam = dayI_dataL.melt(id_vars=[0],
                                 value_vars=list(dayI_dataL.columns),
                                 var_name='variable',
                                 value_name='value')
    dayI_datam.iloc[:, 0] = dayI_datam.iloc[:, 0] + "_" + dayI_datam.iloc[:, 1].astype(str)
    dayI_datam.drop(columns=['variable'], inplace=True)
    dayI_data = dayI_datam.T
    dayI_data.columns = dayI_data.iloc[0]
    dayI_data = dayI_data[1:]
    dayI_data.insert(0, day_txt_cols[0], date_day)
    # Convert the other columns to numeric (float)
    for col in range(1, dayI_data.shape[1]):
        dayI_data.iloc[:, col] = pd.to_numeric(dayI_data.iloc[:, col], errors='coerce')
    ndayI, ndayIcols = dayI_data.shape
    assert ndayI <= 1
    if ndayI == 0:
        error_msg += "\n"+ filepathlab + " - WARNING : no 'dayI' data found\n"
    else :
        ok_msg += filepathlab + " - SUCCESS reading 'dayI' data\n"
    assert ndayIcols <= len(dayI_real_cols) + len(day_txt_cols)
    dayI_missingcols = list(set(dayI_real_cols + day_txt_cols) - set(dayI_data.columns))
    if len(dayI_missingcols) > 0:
        error_msg += "\n"+ filepathlab + " - WARNING : missing 'dayI' data columns (" + \
                     ','.join(dayI_missingcols) + ")\n"
    else:
        ok_msg += "\n"+ filepathlab + " - SUCCESS found all 'dayI' data columns\n"
    # for every *_L1 column with a matching *_L2 column, add a derived
    # *_L1_L2 column holding their element-wise sum (unless present)
    for col in time_data.columns:
        if col.endswith('_L1'):
            col_l2 = col.replace('_L1', '_L2')
            if col_l2 in time_data.columns:
                new_col = col.replace('_L1', '_L1_L2')
                if not new_col in time_data.columns:
                    time_data[new_col] = time_data[col] + time_data[col_l2]
    return {"time_data" : time_data,
            "dayP_data" : dayP_data,
            "dayI_data" : dayI_data,
            "error" : error_msg,
            "success" : ok_msg}
def create_and_insert(timeData, dayiData, daypData=None):
    """Create the SQLite tables (if missing) and append the DataFrames.

    Parameters
    ----------
    timeData : pandas.DataFrame
        Minute-resolution data, appended to table ``dbTime_name``.
    dayiData : pandas.DataFrame
        Daily 'I' data, appended to table ``dbDayI_name``.
    daypData : pandas.DataFrame or None, optional
        Daily 'P' data for table ``dbDayP_name``; skipped when None.

    Table and column names come from settings (db_file, dbTime_name,
    time_txt_cols, ...) — trusted configuration, not user input, so
    building the CREATE TABLE statements by concatenation is acceptable.
    """
    # Connect to the SQLite database
    conn = sqlite3.connect(db_file)
    try:
        c = conn.cursor()
        # Create the tables if they do not exist yet
        c.execute('''CREATE TABLE IF NOT EXISTS ''' + dbTime_name + "(" +
                  ','.join([x + " TEXT" for x in time_txt_cols]) + "," +
                  ','.join([x + " REAL" for x in time_real_cols + time_added_cols]) + '''
                  )''')
        # BUG FIX: `if daypData:` raises "The truth value of a DataFrame
        # is ambiguous" whenever a DataFrame is passed; test against None.
        if daypData is not None:
            c.execute('''CREATE TABLE IF NOT EXISTS ''' + dbDayP_name + "(" +
                      ','.join([x + " TEXT" for x in day_txt_cols]) + "," +
                      ','.join([x + " REAL" for x in dayP_real_cols]) + '''
                      )''')
        c.execute('''CREATE TABLE IF NOT EXISTS ''' + dbDayI_name + "(" +
                  ','.join([x + " TEXT" for x in day_txt_cols]) + "," +
                  ','.join([x + " REAL" for x in dayI_real_cols]) + '''
                  )''')
        conn.commit()
        # Append the data into the database
        timeData.to_sql(dbTime_name, conn,
                        if_exists='append', index=False)
        if daypData is not None:
            daypData.to_sql(dbDayP_name, conn,
                            if_exists='append', index=False)
        dayiData.to_sql(dbDayI_name, conn,
                        if_exists='append', index=False)
    finally:
        # close the connection even if table creation or an insert fails
        conn.close()
def create_and_concat(timeData, dayiData, currentTimeData,
                      currentDayiData, currentDaypData=None, daypData=None):
    """Concatenate freshly parsed data with the already accumulated data.

    Parameters
    ----------
    timeData, dayiData : pandas.DataFrame
        New 'time' and 'dayI' data to add.
    currentTimeData, currentDayiData : pandas.DataFrame or None
        Previously accumulated data; when None, the new data is returned
        as-is.
    currentDaypData, daypData : optional
        Reserved for 'dayP' handling — not implemented yet.

    Returns
    -------
    dict with keys "new_time_data", "new_dayI_data" and "msg" (log text).
    """
    out_txt = ""
    out_txt += "receive time data to add : " + print_df_shape(timeData)
    out_txt += "receive dayI data to add : " + print_df_shape(dayiData)
    # NOTE: not implemented for dayP yet
    # BUG FIX: `if currentTimeData:` raises "The truth value of a
    # DataFrame is ambiguous" whenever a DataFrame is passed; compare
    # with None explicitly (same fix for currentDayiData below).
    if currentTimeData is not None:
        newTimeData = pd.concat([timeData, currentTimeData], axis=0)
    else:
        newTimeData = timeData
    if currentDayiData is not None:
        newDayiData = pd.concat([dayiData, currentDayiData], axis=0)
    else:
        newDayiData = dayiData
    out_txt += "return updated time data : " + print_df_shape(newTimeData)
    out_txt += "return updated dayI data : " + print_df_shape(newDayiData)
    return{"new_dayI_data" : newDayiData,
           "new_time_data": newTimeData,
           "msg" : out_txt
           }
|