|
import pandas as pd |
|
from sklearn.model_selection import train_test_split |
|
import tensorflow as tf |
|
from tokenizers import Tokenizer |
|
from tokenizers.models import BPE |
|
from tokenizers.pre_tokenizers import Whitespace |
|
from tokenizers.trainers import BpeTrainer |
|
import numpy as np |
|
|
|
def _train_bpe_tokenizer(sentences):
    """Train a whitespace-pre-tokenized BPE tokenizer on an iterable of sentences."""
    # unk_token makes unseen characters encode to <UNK> instead of being
    # silently dropped; the trainer already reserves "<UNK>" in the vocab.
    tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=["<UNK>", "<BOS>", "<EOS>"])
    tokenizer.train_from_iterator(sentences, trainer=trainer)
    # Register the markers as special tokens so the Whitespace pre-tokenizer
    # does not split "<BOS>" into "<", "BOS", ">" at encode time — without
    # this the reserved ids in the vocab are never actually produced.
    tokenizer.add_special_tokens(["<BOS>", "<EOS>"])
    return tokenizer


def _encode(tokenizer, sentences):
    """Encode each sentence to its list of token ids."""
    return [tokenizer.encode(sent).ids for sent in sentences]


def preprocess(csv_path='English_To_Klingon.csv', test_size=0.2, random_state=42):
    """Build tokenizers and padded id sequences for Klingon->English translation.

    Reads a parallel-corpus CSV with 'english' and 'klingon' columns, wraps the
    English (target) side in <BOS>/<EOS> markers, trains one BPE tokenizer per
    language on the training split only (no test leakage), and post-pads all id
    sequences to the per-language maximum length.

    Args:
        csv_path: Path to the parallel-corpus CSV. Default preserves the
            original hard-coded filename.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed for the train/test split, for reproducibility.

    Returns:
        Tuple of (klingon_tokenizer, english_tokenizer,
        klingon_vocab_size, english_vocab_size,
        klingon_train_padded, english_train_input, english_train_target,
        klingon_test_padded, english_test_input, english_test_target,
        max_length_klingon, max_length_english).
        Vocab sizes are get_vocab_size() + 1 — presumably an extra slot for
        padding. NOTE(review): padding uses id 0, which is also <UNK>'s id;
        confirm the downstream model masks/handles this.
    """
    data = pd.read_csv(csv_path)

    # Target-side sentences get explicit start/end markers for teacher forcing.
    data['english'] = data['english'].apply(lambda x: '<BOS> ' + x + ' <EOS>')

    english_train, english_test, klingon_train, klingon_test = train_test_split(
        data['english'].values, data['klingon'].values,
        test_size=test_size, random_state=random_state)

    # Tokenizers see only the training split so the test set stays unseen.
    english_tokenizer = _train_bpe_tokenizer(english_train)
    klingon_tokenizer = _train_bpe_tokenizer(klingon_train)

    english_train_sequences = _encode(english_tokenizer, english_train)
    english_test_sequences = _encode(english_tokenizer, english_test)
    klingon_train_sequences = _encode(klingon_tokenizer, klingon_train)
    klingon_test_sequences = _encode(klingon_tokenizer, klingon_test)

    # Pad to the longest sequence seen in either split, per language.
    max_length_english = max(len(seq) for seq in
                             english_train_sequences + english_test_sequences)
    max_length_klingon = max(len(seq) for seq in
                             klingon_train_sequences + klingon_test_sequences)

    pad = tf.keras.preprocessing.sequence.pad_sequences
    english_train_padded = pad(english_train_sequences,
                               maxlen=max_length_english, padding='post')
    english_test_padded = pad(english_test_sequences,
                              maxlen=max_length_english, padding='post')
    klingon_train_padded = pad(klingon_train_sequences,
                               maxlen=max_length_klingon, padding='post')
    klingon_test_padded = pad(klingon_test_sequences,
                              maxlen=max_length_klingon, padding='post')

    # Teacher forcing: decoder input drops the last token, the target drops
    # the first (shift-by-one). expand_dims adds the trailing axis that
    # sparse-categorical Keras losses expect on targets.
    english_train_input = english_train_padded[:, :-1]
    english_train_target = np.expand_dims(english_train_padded[:, 1:], -1)
    english_test_input = english_test_padded[:, :-1]
    english_test_target = np.expand_dims(english_test_padded[:, 1:], -1)

    return (klingon_tokenizer, english_tokenizer,
            klingon_tokenizer.get_vocab_size() + 1,
            english_tokenizer.get_vocab_size() + 1,
            klingon_train_padded, english_train_input, english_train_target,
            klingon_test_padded, english_test_input, english_test_target,
            max_length_klingon, max_length_english)
|
|