Spaces:

reddit-tools-HF
/

dataset-creator-reddit-bestofredditorupdates

Running

App Files Files Community

derek-thomas committed on Nov 12, 2023

Commit

76a52b4

1 Parent(s): 9a66c2f

Updating merge function to show if a row was updated

Browse files

Files changed (1) hide show

utilities/data_collator.py +18 -4

utilities/data_collator.py CHANGED Viewed

@@ -13,7 +13,8 @@ def get_latest_data():
 def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     """
     For each id, creates a new row with the longest content and the highest score
-    from the available rows with the same id.
     Parameters:
     - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
@@ -21,15 +22,18 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
     - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                     with the longest content available and the highest score from
-                    potentially different rows.
     """
     # Create a column for content length
     df['content_length'] = df['content'].str.len()
     # Find row with the longest content for each 'id'
     idx_longest_content = df.groupby('id')['content_length'].idxmax().values
-    df_longest_content = df.loc[idx_longest_content].drop(columns=['score'])
     # Find row with the highest score for each 'id'
     idx_highest_score = df.groupby('id')['score'].idxmax().values
@@ -38,9 +42,19 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     # Merge the two DataFrames on 'id'
     df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
-    return df_merged
 def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:

 def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     """
     For each id, creates a new row with the longest content and the highest score
+    from the available rows with the same id. Adds a boolean column 'updated'
+    indicating whether the row was updated.
     Parameters:
     - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
     Returns:
     - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                     with the longest content available and the highest score from
+                    potentially different rows, and a boolean column 'updated'.
     """
+    # Create a copy of the original DataFrame to avoid modifying it directly
+    original_df = df.copy()
     # Create a column for content length
     df['content_length'] = df['content'].str.len()
     # Find row with the longest content for each 'id'
     idx_longest_content = df.groupby('id')['content_length'].idxmax().values
+    df_longest_content = df.loc[idx_longest_content][['id', 'content']]
     # Find row with the highest score for each 'id'
     idx_highest_score = df.groupby('id')['score'].idxmax().values
     # Merge the two DataFrames on 'id'
     df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
+    # Check if the content or score was updated for each id
+    df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
+    df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
+            df_merged['score'] != df_merged['score_original'])
+    # Drop duplicates to keep only the rows with longest content and highest score
+    df_merged.drop_duplicates(subset='id', inplace=True)
+    # Drop original content and score columns
+    df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
+    return df_merged
 def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame: