import logging
from pathlib import Path
from typing import Union, Optional
import pandas as pd

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class DataPreprocessor:
"""A class to handle data preprocessing operations for different file formats."""
@staticmethod
def _preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""
Applies standard preprocessing steps to a DataFrame.
Args:
df (pd.DataFrame): Input DataFrame to preprocess
Returns:
pd.DataFrame: Preprocessed DataFrame
"""
try:
# Convert text columns to lowercase for standardization
df = df.map(lambda x: x.lower() if isinstance(x, str) else x)
# Drop columns that are fully null
df = df.dropna(axis=1, how='all')
# Fill remaining NaN values with empty strings
df = df.fillna('')
# Remove duplicate rows
df = df.drop_duplicates()
return df
except Exception as e:
logger.error(f"Error during DataFrame preprocessing: {str(e)}")
raise
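
    # Illustrative worked example (not part of the original file) of what
    # _preprocess_dataframe does to a small frame, assuming pandas >= 2.1:
    #   pd.DataFrame({"Name": ["Foo", "FOO", None], "Empty": [None, None, None]})
    #   -> "Name" is lowercased to ["foo", "foo", NaN], the all-null "Empty" column
    #      is dropped, NaN becomes "", and the duplicate "foo" row is removed,
    #      leaving two rows.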

    @classmethod
    def preprocess_msd(cls,
                       file_path: Union[str, Path],
                       output_path: Union[str, Path],
                       sheet_name: Optional[Union[str, int]] = 0) -> pd.DataFrame:
        """
        Preprocesses an MSD Excel file and saves the result.

        Args:
            file_path: Path to the Excel file
            output_path: Directory path for the output file
            sheet_name: Sheet name or index to load (default: 0)

        Returns:
            pd.DataFrame: Preprocessed DataFrame

        Raises:
            FileNotFoundError: If input file doesn't exist
            PermissionError: If output directory is not writable
        """
        try:
            # Convert to Path objects
            file_path = Path(file_path)
            output_path = Path(output_path)

            # Validate input file
            if not file_path.exists():
                raise FileNotFoundError(f"Input file not found: {file_path}")

            # Ensure output directory exists
            output_path.mkdir(parents=True, exist_ok=True)

            logger.info(f"Processing MSD file: {file_path}")
            df = pd.read_excel(file_path, sheet_name=sheet_name)

            # Apply preprocessing
            df = cls._preprocess_dataframe(df)

            # Save processed file
            output_file = output_path / "msd_processed.csv"
            df.to_csv(output_file, index=False)
            logger.info(f"Saved processed file to: {output_file}")

            return df
        except Exception as e:
            logger.error(f"Error processing MSD file: {str(e)}")
            raise

    @classmethod
    def preprocess_cbip(cls,
                        input_dir: Union[str, Path],
                        output_dir: Union[str, Path]) -> None:
        """
        Preprocesses all CSV files in the CBIP directory.

        Args:
            input_dir: Directory containing input CSV files
            output_dir: Directory for output files

        Raises:
            FileNotFoundError: If input directory doesn't exist
            PermissionError: If output directory is not writable
        """
        try:
            # Convert to Path objects
            input_dir = Path(input_dir)
            output_dir = Path(output_dir)

            # Validate input directory
            if not input_dir.exists():
                raise FileNotFoundError(f"Input directory not found: {input_dir}")

            # Ensure output directory exists
            output_dir.mkdir(parents=True, exist_ok=True)

            # Collect all CSV files, searching subdirectories recursively
            csv_files = list(input_dir.rglob("*.csv"))
            if not csv_files:
                logger.warning(f"No CSV files found in: {input_dir}")
                return

            for file_path in csv_files:
                try:
                    logger.info(f"Processing CBIP file: {file_path}")

                    # Read semicolon-delimited CSV file
                    df = pd.read_csv(
                        file_path,
                        delimiter=';',
                        quotechar='"',
                        skip_blank_lines=True
                    )

                    # Apply preprocessing
                    df = cls._preprocess_dataframe(df)

                    # Save processed file (comma-delimited) under the original file name
                    output_file = output_dir / file_path.name
                    df.to_csv(output_file, index=False)
                    logger.info(f"Saved processed file to: {output_file}")
                except Exception as e:
                    logger.error(f"Error processing {file_path}: {str(e)}")
                    continue
        except Exception as e:
            logger.error(f"Error processing CBIP directory: {str(e)}")
            raise
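

# Programmatic use of the class (paths are illustrative, not from the original project):
#   df = DataPreprocessor.preprocess_msd("data/msd_report.xlsx", "processed_data/msd")
#   DataPreprocessor.preprocess_cbip("data/cbip", "processed_data/cbip")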


def main():
    """Main execution function."""
    try:
        import argparse

        # Create processed_data directory in the current working directory
        output_base = Path.cwd() / "processed_data"
        msd_output = output_base / "msd"
        cbip_output = output_base / "cbip"

        parser = argparse.ArgumentParser(description='Process MSD and CBIP data files.')
        parser.add_argument('--msd-input', required=True, help='Path to MSD Excel file')
        parser.add_argument('--cbip-input', required=True, help='Input directory containing CBIP CSV files')
        args = parser.parse_args()

        preprocessor = DataPreprocessor()

        # Process MSD file
        preprocessor.preprocess_msd(
            args.msd_input,
            msd_output
        )

        # Process CBIP directory
        preprocessor.preprocess_cbip(
            args.cbip_input,
            cbip_output
        )
    except Exception as e:
        logger.error(f"Main execution failed: {str(e)}")
        raise


if __name__ == "__main__":
    main()
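
# Example invocation (the script name and input paths below are illustrative,
# not part of the original project):
#   python preprocess_data.py --msd-input data/msd_report.xlsx --cbip-input data/cbip/
# Output lands in ./processed_data/msd/msd_processed.csv and in ./processed_data/cbip/
# (one CSV per input file, keeping the original file names).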