Abu1998 committed on
Commit
b6788e9
·
verified ·
1 Parent(s): 16a8644

Update data_cleaning.py

Browse files
Files changed (1) hide show
  1. data_cleaning.py +8 -2
data_cleaning.py CHANGED
@@ -10,13 +10,19 @@ def clean_data(df):
10
  df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
11
 
12
  # Drop rows where 'Reply' column is missing
 
13
  df.dropna(subset=['Reply'], inplace=True)
 
14
 
15
  # Calculate comment and reply lengths
16
  df['comment_length'] = df['Comment'].str.len()
17
  df['reply_length'] = df['Reply'].str.len()
18
 
19
  # Remove duplicate rows
20
- df = df.drop_duplicates()
 
21
 
22
- return df
 
 
 
 
10
  df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])
11
 
12
  # Drop rows where 'Reply' column is missing
13
+ before = df.shape[0]
14
  df.dropna(subset=['Reply'], inplace=True)
15
+ after = df.shape[0]
16
 
17
  # Calculate comment and reply lengths
18
  df['comment_length'] = df['Comment'].str.len()
19
  df['reply_length'] = df['Reply'].str.len()
20
 
21
  # Remove duplicate rows
22
+ num_duplicates = df.duplicated().sum()
23
+ df_deduplicated = df.drop_duplicates()
24
 
25
+ # Print number of duplicates
26
+ print('Number of duplicate rows:', num_duplicates)
27
+
28
+ return df_deduplicated