Spaces:
Runtime error
Runtime error
File size: 1,920 Bytes
a3e82d3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import re
from pathlib import Path
import polars as pl
from sklearn.model_selection import train_test_split
def preprocess_data(data_dir: Path):
    """Turn ``train.csv`` in *data_dir* into a cleaned ``preprocessed_df.csv``.

    Steps, in order:

    1. Read ``train.csv`` and rename its first three columns to
       ``polarity``, ``title`` and ``text``.
    2. Check that the polarity labels span exactly 1..2.
    3. Drop every row that contains a null.
    4. Remap polarity so that 1 becomes 0 and any other value becomes 1,
       stored as ``Int16``.
    5. Join ``title`` and ``text`` with a single space into a lower-cased
       ``review`` column.
    6. Keep only ``review`` and ``polarity`` and run ``clean_text`` over
       every review.
    7. Write the result to ``preprocessed_df.csv`` in the same directory.

    Args:
        data_dir: Directory that contains ``train.csv`` and receives the
            output file.

    Raises:
        AssertionError: If the largest polarity is not 2 or the smallest
            is not 1.
    """
    # NOTE(review): read_csv uses its default header handling here, so the
    # first row of train.csv is presumably consumed as a header -- confirm
    # the file actually has one.
    df = pl.read_csv(
        data_dir / 'train.csv',
        new_columns=['polarity', 'title', 'text'],
    )

    assert df['polarity'].max() == 2, "expected the largest polarity label to be 2"
    assert df['polarity'].min() == 1, "expected the smallest polarity label to be 1"

    # drop_nulls() returns a new frame; the result has to be re-bound or the
    # null rows stay in `df`.
    df = df.drop_nulls()

    # Map polarity to binary values (0 for negative, 1 for positive) with a
    # native expression instead of a per-row Python callback.
    df = df.with_columns([
        pl.when(pl.col('polarity') == 1)
        .then(0)
        .otherwise(1)
        .cast(pl.Int16)
        .alias('polarity')
    ])

    # Combine title and text into the review column and lower-case it.
    # `.str` is a namespace attribute (not callable); the lowering method is
    # `to_lowercase`.
    df = df.with_columns([
        (pl.col('title') + ' ' + pl.col('text'))
        .str.to_lowercase()
        .alias('review')
    ])

    # Select relevant columns
    df = df.select(['review', 'polarity'])

    # Per-element text cleaning. Newer polars exposes this as `map_elements`
    # while older releases only have `apply`; use whichever is available so
    # the function runs on both.
    review = pl.col('review')
    run_per_element = getattr(review, 'map_elements', None) or review.apply
    df = df.with_columns([
        run_per_element(clean_text, return_dtype=pl.Utf8)
    ])

    df.write_csv(data_dir / 'preprocessed_df.csv')
import re
import contractions
# Compile the regular expressions once at import time so that clean_text does
# not have to look the patterns up on every call.
# Any character that is neither a word character nor whitespace.
PUNCTUATION_REGEX = re.compile(r'[^\w\s]')
# A single digit.
DIGIT_REGEX = re.compile(r'\d')
# The characters '#', '@' and '&'. Commas are deliberately absent: inside a
# character class a comma is a literal member, not a separator, so listing
# it would make the pattern match ',' as well.
SPECIAL_CHARACTERS_REGEX = re.compile(r'[#@&]')
# A run of one or more whitespace characters.
MULTIPLE_SPACES_REGEX = re.compile(r'\s+')
def clean_text(x: str) -> str:
    """Normalise one review string for downstream processing.

    The input is passed through ``contractions.fix`` and lower-cased, then a
    fixed sequence of regex substitutions is applied, and finally leading
    and trailing whitespace is stripped.

    Args:
        x: Raw text.

    Returns:
        The cleaned text.
    """
    cleaned = contractions.fix(x).lower()

    # Order matters: punctuation is turned into spaces first, and whitespace
    # runs are collapsed last so gaps left by earlier steps are merged.
    substitutions = (
        (PUNCTUATION_REGEX, ' '),         # punctuation -> space
        (DIGIT_REGEX, ''),                # drop digits
        (SPECIAL_CHARACTERS_REGEX, ''),   # drop special characters
        (MULTIPLE_SPACES_REGEX, ' '),     # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.strip()
|