import logging
from pathlib import Path
from typing import Union, Optional
import pandas as pd

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class DataPreprocessor:
"""A class to handle data preprocessing operations for different file formats."""
@staticmethod
def _preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""
Applies standard preprocessing steps to a DataFrame.
Args:
df (pd.DataFrame): Input DataFrame to preprocess
Returns:
pd.DataFrame: Preprocessed DataFrame
"""
try:
# Convert text columns to lowercase for standardization
df = df.map(lambda x: x.lower() if isinstance(x, str) else x)
# Drop columns that are fully null
df = df.dropna(axis=1, how='all')
# Fill remaining NaN values with empty strings
df = df.fillna('')
# Remove duplicate rows
df = df.drop_duplicates()
return df
except Exception as e:
logger.error(f"Error during DataFrame preprocessing: {str(e)}")
raise
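
    # Illustrative worked example (not part of the original file) of what
    # _preprocess_dataframe does to a small frame, assuming pandas >= 2.1:
    #   pd.DataFrame({"Name": ["Foo", "FOO", None], "Empty": [None, None, None]})
    #   -> "Name" is lowercased to ["foo", "foo", NaN], the all-null "Empty" column
    #      is dropped, NaN becomes "", and the duplicate "foo" row is removed,
    #      leaving two rows.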

    @classmethod
    def preprocess_msd(cls,
                       file_path: Union[str, Path],
                       output_path: Union[str, Path],
                       sheet_name: Optional[Union[str, int]] = 0) -> pd.DataFrame:
        """
        Preprocesses an MSD Excel file and saves the result.

        Args:
            file_path: Path to the Excel file
            output_path: Directory path for the output file
            sheet_name: Sheet name or index to load (default: 0)

        Returns:
            pd.DataFrame: Preprocessed DataFrame

        Raises:
            FileNotFoundError: If input file doesn't exist
            PermissionError: If output directory is not writable
        """
        try:
            # Convert to Path objects
            file_path = Path(file_path)
            output_path = Path(output_path)

            # Validate input file
            if not file_path.exists():
                raise FileNotFoundError(f"Input file not found: {file_path}")

            # Ensure output directory exists
            output_path.mkdir(parents=True, exist_ok=True)

            logger.info(f"Processing MSD file: {file_path}")
            df = pd.read_excel(file_path, sheet_name=sheet_name)

            # Apply preprocessing
            df = cls._preprocess_dataframe(df)

            # Save processed file
            output_file = output_path / "msd_processed.csv"
            df.to_csv(output_file, index=False)
            logger.info(f"Saved processed file to: {output_file}")

            return df
        except Exception as e:
            logger.error(f"Error processing MSD file: {str(e)}")
            raise

    @classmethod
    def preprocess_cbip(cls,
                        input_dir: Union[str, Path],
                        output_dir: Union[str, Path]) -> None:
        """
        Preprocesses all CSV files in the CBIP directory.

        Args:
            input_dir: Directory containing input CSV files
            output_dir: Directory for output files

        Raises:
            FileNotFoundError: If input directory doesn't exist
            PermissionError: If output directory is not writable
        """
        try:
            # Convert to Path objects
            input_dir = Path(input_dir)
            output_dir = Path(output_dir)

            # Validate input directory
            if not input_dir.exists():
                raise FileNotFoundError(f"Input directory not found: {input_dir}")

            # Ensure output directory exists
            output_dir.mkdir(parents=True, exist_ok=True)

            # Collect all CSV files, searching subdirectories recursively
            csv_files = list(input_dir.rglob("*.csv"))
            if not csv_files:
                logger.warning(f"No CSV files found in: {input_dir}")
                return

            for file_path in csv_files:
                try:
                    logger.info(f"Processing CBIP file: {file_path}")

                    # Read semicolon-delimited CSV file
                    df = pd.read_csv(
                        file_path,
                        delimiter=';',
                        quotechar='"',
                        skip_blank_lines=True
                    )

                    # Apply preprocessing
                    df = cls._preprocess_dataframe(df)

                    # Save processed file (comma-delimited) under the original file name
                    output_file = output_dir / file_path.name
                    df.to_csv(output_file, index=False)
                    logger.info(f"Saved processed file to: {output_file}")
                except Exception as e:
                    logger.error(f"Error processing {file_path}: {str(e)}")
                    continue
        except Exception as e:
            logger.error(f"Error processing CBIP directory: {str(e)}")
            raise
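

# Programmatic use of the class (paths are illustrative, not from the original project):
#   df = DataPreprocessor.preprocess_msd("data/msd_report.xlsx", "processed_data/msd")
#   DataPreprocessor.preprocess_cbip("data/cbip", "processed_data/cbip")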


def main():
    """Main execution function."""
    try:
        import argparse

        # Create processed_data directory in the current working directory
        output_base = Path.cwd() / "processed_data"
        msd_output = output_base / "msd"
        cbip_output = output_base / "cbip"

        parser = argparse.ArgumentParser(description='Process MSD and CBIP data files.')
        parser.add_argument('--msd-input', required=True, help='Path to MSD Excel file')
        parser.add_argument('--cbip-input', required=True, help='Input directory containing CBIP CSV files')
        args = parser.parse_args()

        preprocessor = DataPreprocessor()

        # Process MSD file
        preprocessor.preprocess_msd(
            args.msd_input,
            msd_output
        )

        # Process CBIP directory
        preprocessor.preprocess_cbip(
            args.cbip_input,
            cbip_output
        )
    except Exception as e:
        logger.error(f"Main execution failed: {str(e)}")
        raise


if __name__ == "__main__":
    main()
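
# Example invocation (the script name and input paths below are illustrative,
# not part of the original project):
#   python preprocess_data.py --msd-input data/msd_report.xlsx --cbip-input data/cbip/
# Output lands in ./processed_data/msd/msd_processed.csv and in ./processed_data/cbip/
# (one CSV per input file, keeping the original file names).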