# NOTE: page-capture residue, not part of the program -- "Spaces: Runtime error / Runtime error"
import re | |
from pathlib import Path | |
import polars as pl | |
from sklearn.model_selection import train_test_split | |
def preprocess_data(data_dir: Path):
    """Build a cleaned, binary-labelled review table from ``train.csv``.

    Reads ``data_dir / 'train.csv'`` with its columns renamed to
    ``polarity``, ``title`` and ``text``; drops rows containing nulls;
    remaps polarity 1 -> 0 and anything else (i.e. 2) -> 1 as Int16; joins
    title and text into a lower-cased ``review`` column; runs ``clean_text``
    on every review; and writes the ``review`` / ``polarity`` columns to
    ``data_dir / 'preprocessed_df.csv'``.

    Args:
        data_dir: Directory containing ``train.csv``. The output file is
            written into the same directory.

    Raises:
        ValueError: If, after dropping nulls, the smallest polarity label is
            not 1 or the largest is not 2.
    """
    # NOTE(review): read_csv treats the first row as a header by default. If
    # train.csv has no header row, that first record is lost -- confirm
    # against the data file and pass has_header=False if needed.
    df = pl.read_csv(
        data_dir / 'train.csv',
        new_columns=['polarity', 'title', 'text'],
    )

    # drop_nulls() returns a new frame; the result has to be re-bound or the
    # null rows stay in ``df``.
    df = df.drop_nulls()

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    lowest = df['polarity'].min()
    highest = df['polarity'].max()
    if lowest != 1 or highest != 2:
        raise ValueError(
            f"expected polarity labels in 1..2, got min={lowest!r}, max={highest!r}"
        )

    df = df.with_columns([
        # 1 (negative) -> 0, 2 (positive) -> 1, done as a native expression
        # rather than a per-row Python lambda.
        (pl.col('polarity') != 1).cast(pl.Int16).alias('polarity'),
        # Title and text merged into one lower-cased review string.
        (pl.col('title') + ' ' + pl.col('text')).str.to_lowercase().alias('review'),
    ])

    # Keep only the model inputs.
    df = df.select(['review', 'polarity'])

    # clean_text is plain Python, so it has to be mapped element by element.
    # NOTE(review): newer Polars releases rename Expr.apply to
    # Expr.map_elements; switch if the pinned version no longer has apply.
    df = df.with_columns([
        pl.col('review').apply(clean_text)
    ])

    df.write_csv(data_dir / 'preprocessed_df.csv')
import re | |
import contractions | |
# Patterns are compiled once at import time so they are not rebuilt on every
# call that uses them.

# Any character that is neither a word character nor whitespace.
# Note that "_" is a word character and is therefore NOT matched.
PUNCTUATION_REGEX = re.compile(r'[^\w\s]')

# A single decimal digit.
DIGIT_REGEX = re.compile(r'\d')

# Exactly "#", "@" or "&". Inside a character class a comma is a literal
# member, not a separator, so the class must not contain commas.
SPECIAL_CHARACTERS_REGEX = re.compile(r'[#@&]')

# A run of one or more whitespace characters (spaces, tabs, newlines, ...).
MULTIPLE_SPACES_REGEX = re.compile(r'\s+')
def clean_text(x: str) -> str:
    """Normalise one review string for downstream processing.

    The input is passed through ``contractions.fix`` (contraction
    expansion), lower-cased, and then run through a fixed sequence of regex
    substitutions before surrounding whitespace is stripped.

    Args:
        x: Raw review text.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Order matters: punctuation becomes a space first, so the whitespace
    # collapse at the end also absorbs the gaps it leaves behind.
    substitutions = (
        (PUNCTUATION_REGEX, ' '),         # punctuation -> space
        (DIGIT_REGEX, ''),                # digits removed
        (SPECIAL_CHARACTERS_REGEX, ''),   # special characters removed
        (MULTIPLE_SPACES_REGEX, ' '),     # whitespace runs -> one space
    )

    cleaned = contractions.fix(x).lower()
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()