Spaces:

Abu1998
/

A2D

Sleeping

Abu1998 committed on Aug 15, 2024

Commit

b6788e9

verified ·

1 Parent(s): 16a8644

Update data_cleaning.py

Files changed (1) hide show

data_cleaning.py CHANGED Viewed

@@ -10,13 +10,19 @@ def clean_data(df):
     df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
     # Drop rows where 'Reply' column is missing
     df.dropna(subset=['Reply'], inplace=True)
     # Calculate comment and reply lengths
     df['comment_length'] = df['Comment'].str.len()
     df['reply_length'] = df['Reply'].str.len()
     # Remove duplicate rows
-    df = df.drop_duplicates()
-    return df

     df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
     # Drop rows where 'Reply' column is missing
+    before = df.shape[0]
     df.dropna(subset=['Reply'], inplace=True)
+    after = df.shape[0]
     # Calculate comment and reply lengths
     df['comment_length'] = df['Comment'].str.len()
     df['reply_length'] = df['Reply'].str.len()
     # Remove duplicate rows
+    num_duplicates = df.duplicated().sum()
+    df_deduplicated = df.drop_duplicates()
+    # Print number of duplicates
+    print('Number of duplicate rows:', num_duplicates)
+    return df_deduplicated