Spaces:

ElijahDi
/

nlp

Sleeping

App Files Files Community

nlp / preprocessing.py

ElijahDi

Upload 11 files

ed0e769 verified 12 months ago

raw

history blame

3.37 kB

	import re
	import string
	import numpy as np
	import torch
	import nltk
	import pymorphy2

	from nltk.corpus import stopwords
	# Request the NLTK 'stopwords' corpus; as a module-level call this runs on every import.
	nltk.download('stopwords')
	# Russian stopword list stored as a set for fast membership tests in the preprocessing functions.
	stop_words = set(stopwords.words('russian'))
	# Shared pymorphy2 analyzer; data_preprocessing_hard uses it to get each word's normal form.
	morph = pymorphy2.MorphAnalyzer()

	def data_preprocessing_hard(text: str) -> str:
	    """Reduce a string to lemmatised, stopword-free Cyrillic words.

	    Steps, in order: lowercase, strip HTML-like tags, drop every character
	    that is neither whitespace nor in the ranges ``а-я`` / ``А-Я``, remove
	    words found in the module-level ``stop_words`` set, and replace each
	    remaining word with the normal form of its first ``morph.parse`` result.

	    Args:
	        text (str): input string for preprocessing

	    Returns:
	        str: lemmas joined by single spaces; empty string if no word survives
	    """
	    text = text.lower()
	    text = re.sub('<.*?>', '', text)  # html tags
	    # Only Cyrillic letters and whitespace survive this substitution, so
	    # digits, Latin letters and all of string.punctuation are already gone;
	    # a separate punctuation or digit pass would be a no-op.
	    # NOTE(review): 'ё' lies outside the а-я range and is dropped too —
	    # confirm this matches the vocabulary built from this function.
	    text = re.sub(r'[^а-яА-Я\s]', '', text)
	    # Filter and lemmatise in a single pass over the words.
	    lemmas = [
	        morph.parse(word)[0].normal_form
	        for word in text.split()
	        if word not in stop_words
	    ]
	    return ' '.join(lemmas)

	def data_preprocessing(text: str) -> str:
	    """Lowercase a string and strip HTML tags, punctuation and stopwords.

	    Args:
	        text (str): input string for preprocessing

	    Returns:
	        str: the remaining words joined by single spaces
	    """
	    lowered = text.lower()
	    without_tags = re.sub('<.*?>', '', lowered)  # html tags
	    # A deletion-only translation table removes exactly the characters
	    # listed in string.punctuation.
	    punctuation_table = str.maketrans('', '', string.punctuation)
	    cleaned = without_tags.translate(punctuation_table)
	    return ' '.join(
	        token for token in cleaned.split() if token not in stop_words
	    )

	def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
	    """Keep the entries whose second element is strictly greater than ``n``.

	    Args:
	        sorted_words (list): indexable items; element ``[1]`` of each item is
	            compared with ``n`` (presumably ``(word, count)`` pairs —
	            confirm against the caller)
	        n (int): exclusive lower bound. Defaults to 10.

	    Returns:
	        list: the matching items in their original order
	    """
	    return [entry for entry in sorted_words if entry[1] > n]

	def padding(review_int: list, seq_len: int) -> np.ndarray:
	    """Make left-sided padding for input list of tokens

	    Args:
	        review_int (list): input list of token sequences; each sequence may be
	            a list, tuple or 1-D array of integer ids
	        seq_len (int): max length of sequence, if len(review_int[i]) > seq_len
	            it will be trimmed to its first seq_len tokens, else it will be
	            padded by zeros on the left

	    Returns:
	        np.ndarray: integer array of shape (len(review_int), seq_len)
	    """
	    features = np.zeros((len(review_int), seq_len), dtype=int)
	    for row, review in enumerate(review_int):
	        trimmed = review[:seq_len]
	        # Write the tokens straight into the right-hand end of the zero row.
	        # Empty sequences are skipped so the row simply stays all zeros.
	        if len(trimmed):
	            features[row, seq_len - len(trimmed):] = trimmed

	    return features

	def preprocess_single_string(
	    input_string: str,
	    seq_len: int,
	    vocab_to_int: dict,
	    verbose : bool = False
	) -> torch.Tensor:
	    """Function for all preprocessing steps on a single string

	    The string is cleaned with ``data_preprocessing``, each remaining word is
	    mapped to its integer id, and the id sequence is left-padded or trimmed
	    to ``seq_len`` with ``padding``.

	    Args:
	        input_string (str): input single string for preprocessing
	        seq_len (int): max length of sequence, longer sequences are trimmed,
	            shorter ones are padded by zeros on the left
	        vocab_to_int (dict): word corpus {'word' : int index}
	        verbose (bool, optional): print every word that is missing from
	            ``vocab_to_int``. Defaults to False.

	    Returns:
	        torch.Tensor: 1-D integer tensor of length ``seq_len``
	    """
	    token_ids = []
	    for word in data_preprocessing(input_string).split():
	        try:
	            token_ids.append(vocab_to_int[word])
	        except KeyError as e:
	            # Out-of-vocabulary words are skipped, not mapped to a placeholder id.
	            if verbose:
	                print(f'{e}: not in dictionary!')
	    # padding works on a batch, so wrap the single sequence and unwrap the row.
	    result_padded = padding([token_ids], seq_len)[0]

	    return torch.tensor(result_padded)

	def predict_review(model, review_text: str, net_config, vocab_to_int) -> float:
	    """Return the model's sigmoid score for a single review text.

	    Args:
	        model: callable that takes a tensor of shape (1, seq_len) holding
	            token ids and returns a tensor with exactly one element
	            (``.item()`` is called on the result)
	        review_text (str): raw review text
	        net_config: object exposing ``seq_len`` and ``device`` attributes
	        vocab_to_int (dict): word corpus {'word' : int index}

	    Returns:
	        float: sigmoid of the model output
	    """
	    sample = preprocess_single_string(review_text, net_config.seq_len, vocab_to_int)
	    # Move the input, not the output, to the configured device so the forward
	    # pass sees a tensor on the device the model is expected to live on.
	    # NOTE(review): assumes the model already resides on net_config.device — confirm at the call site.
	    batch = sample.unsqueeze(0).to(net_config.device)
	    # Inference only: skip building the autograd graph.
	    # NOTE(review): the model's train/eval mode is left untouched here; callers
	    # should put it in eval mode beforehand if it has dropout or similar layers.
	    with torch.no_grad():
	        probability_lstm = model(batch).sigmoid()
	    return probability_lstm.item()