derek-thomas
committed on
Commit
·
76a52b4
1
Parent(s):
9a66c2f
Updating merge function to show if a row was updated
Browse files
- utilities/data_collator.py +18 -4
utilities/data_collator.py
CHANGED
@@ -13,7 +13,8 @@ def get_latest_data():
|
|
13 |
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
|
14 |
"""
|
15 |
For each id, creates a new row with the longest content and the highest score
|
16 |
-
from the available rows with the same id.
|
|
|
17 |
|
18 |
Parameters:
|
19 |
- df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
|
@@ -21,15 +22,18 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
|
|
21 |
Returns:
|
22 |
- pd.DataFrame: A DataFrame with unique ids, where each id is associated
|
23 |
with the longest content available and the highest score from
|
24 |
-
potentially different rows.
|
25 |
"""
|
26 |
|
|
|
|
|
|
|
27 |
# Create a column for content length
|
28 |
df['content_length'] = df['content'].str.len()
|
29 |
|
30 |
# Find row with the longest content for each 'id'
|
31 |
idx_longest_content = df.groupby('id')['content_length'].idxmax().values
|
32 |
-
df_longest_content = df.loc[idx_longest_content]
|
33 |
|
34 |
# Find row with the highest score for each 'id'
|
35 |
idx_highest_score = df.groupby('id')['score'].idxmax().values
|
@@ -38,9 +42,19 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
|
|
38 |
# Merge the two DataFrames on 'id'
|
39 |
df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
|
40 |
|
41 |
-
return df_merged
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
|
|
|
|
|
|
|
|
44 |
|
45 |
|
46 |
def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
|
|
|
13 |
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
|
14 |
"""
|
15 |
For each id, creates a new row with the longest content and the highest score
|
16 |
+
from the available rows with the same id. Adds a boolean column 'updated'
|
17 |
+
indicating whether the row was updated.
|
18 |
|
19 |
Parameters:
|
20 |
- df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
|
|
|
22 |
Returns:
|
23 |
- pd.DataFrame: A DataFrame with unique ids, where each id is associated
|
24 |
with the longest content available and the highest score from
|
25 |
+
potentially different rows, and a boolean column 'updated'.
|
26 |
"""
|
27 |
|
28 |
+
# Create a copy of the original DataFrame to avoid modifying it directly
|
29 |
+
original_df = df.copy()
|
30 |
+
|
31 |
# Create a column for content length
|
32 |
df['content_length'] = df['content'].str.len()
|
33 |
|
34 |
# Find row with the longest content for each 'id'
|
35 |
idx_longest_content = df.groupby('id')['content_length'].idxmax().values
|
36 |
+
df_longest_content = df.loc[idx_longest_content][['id', 'content']]
|
37 |
|
38 |
# Find row with the highest score for each 'id'
|
39 |
idx_highest_score = df.groupby('id')['score'].idxmax().values
|
|
|
42 |
# Merge the two DataFrames on 'id'
|
43 |
df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
|
44 |
|
|
|
45 |
|
46 |
+
# Check if the content or score was updated for each id
|
47 |
+
df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
|
48 |
+
df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
|
49 |
+
df_merged['score'] != df_merged['score_original'])
|
50 |
+
|
51 |
+
# Drop duplicates to keep only the rows with longest content and highest score
|
52 |
+
df_merged.drop_duplicates(subset='id', inplace=True)
|
53 |
|
54 |
+
# Drop original content and score columns
|
55 |
+
df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
|
56 |
+
|
57 |
+
return df_merged
|
58 |
|
59 |
|
60 |
def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
|