amazon-review-sentiment-analysis / src /data_preprocessing.py
tikendraw's picture
application
a3e82d3
raw
history blame
1.92 kB
import re
from pathlib import Path
import polars as pl
from sklearn.model_selection import train_test_split
def preprocess_data(data_dir: Path):
    """Clean the raw review CSV and write a preprocessed copy beside it.

    Reads ``data_dir / 'train.csv'``, naming its first three columns
    ``polarity``, ``title`` and ``text``; drops rows containing nulls; maps
    polarity 1 -> 0 and 2 -> 1; joins title and text into a single lowercased
    ``review`` column; runs ``clean_text`` over every review; and writes the
    resulting ``review``/``polarity`` frame to
    ``data_dir / 'preprocessed_df.csv'``.

    Args:
        data_dir: Directory containing ``train.csv``; the output file is
            written into the same directory.

    Raises:
        AssertionError: If the polarity column's maximum is not 2 or its
            minimum is not 1.
    """
    # NOTE(review): read_csv treats the first row as a header by default; if
    # train.csv has no header row, that first record is consumed -- confirm.
    df = pl.read_csv(
        data_dir / 'train.csv',
        new_columns=['polarity', 'title', 'text'],
    )

    assert df['polarity'].max() == 2, "expected polarity labels to top out at 2"
    assert df['polarity'].min() == 1, "expected polarity labels to start at 1"

    # drop_nulls returns a new frame; the result has to be re-bound or the
    # null rows silently survive into the output.
    df = df.drop_nulls()

    # Map polarity to binary values (0 for negative, 1 for positive) as Int16.
    # The comparison is vectorized, so no per-row Python callback is needed.
    df = df.with_columns([
        (pl.col('polarity') != 1).cast(pl.Int16)
    ])

    # Combine title and text columns to create the review column.
    df = df.with_columns([
        (pl.col('title') + ' ' + pl.col('text')).alias('review')
    ])

    # `.str` is a namespace attribute, not a callable; the lowercase
    # operation on it is `to_lowercase`.
    df = df.with_columns([
        pl.col('review').str.to_lowercase()
    ])

    # Select relevant columns.
    df = df.select(['review', 'polarity'])

    # Perform text cleaning with the module's clean_text helper.
    # NOTE(review): newer polars releases rename `apply` to `map_elements`;
    # confirm against the pinned polars version before upgrading.
    df = df.with_columns([
        pl.col('review').apply(clean_text)
    ])

    df.write_csv(data_dir / 'preprocessed_df.csv')
import re
import contractions
# Compile the regular expressions once at import time so they are not
# rebuilt on every call to the cleaning function.

# Any character that is neither a word character nor whitespace.
PUNCTUATION_REGEX = re.compile(r'[^\w\s]')
# A single decimal digit.
DIGIT_REGEX = re.compile(r'\d')
# The characters '#', '@' and '&'. Inside a character class a comma is a
# literal member, not a separator, so none are written here.
SPECIAL_CHARACTERS_REGEX = re.compile(r'[#@&]')
# One or more consecutive whitespace characters.
MULTIPLE_SPACES_REGEX = re.compile(r'\s+')
def clean_text(x: str) -> str:
    """Normalize one review string for downstream modelling.

    The input is passed through ``contractions.fix``, lowercased, and then
    run through the module's compiled patterns in a fixed order before
    leading and trailing whitespace is stripped.

    Args:
        x: Raw review text.

    Returns:
        The cleaned text.
    """
    cleaned = contractions.fix(x).lower()

    # Order matters: punctuation becomes a space first, and whitespace runs
    # are collapsed last so that gaps left by earlier steps are merged.
    substitutions = (
        (PUNCTUATION_REGEX, ' '),
        (DIGIT_REGEX, ''),
        (SPECIAL_CHARACTERS_REGEX, ''),
        (MULTIPLE_SPACES_REGEX, ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.strip()