DiegoTheExplorar commited on
Commit
a2570fa
·
verified ·
1 Parent(s): f41dc5a

Upload 8 files

Browse files
Files changed (8) hide show
  1. DataPPwithspecial.py +55 -0
  2. Decoder.py +66 -0
  3. Encoder.py +51 -0
  4. English_to_Klingon.pth +3 -0
  5. README.md +12 -12
  6. Seq2SeqModel.py +71 -0
  7. app.py +83 -0
  8. requirements.txt +112 -0
DataPPwithspecial.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ import tensorflow as tf
5
+
6
def preprocess(csv_path='./backend/English_To_Klingon.csv', max_length=50,
               vocab_size=5000, test_size=0.2, random_state=42):
    """
    Load the English/Klingon parallel corpus and convert it to padded id arrays.

    Parameters:
    ----------
    csv_path : str
        CSV file with an 'english' and a 'klingon' column.
    max_length : int
        Length every tokenized sentence is padded (with 0, at the end) or
        truncated to.
    vocab_size : int
        `num_words` limit handed to both tokenizers.
    test_size : float
        Fraction of sentence pairs held out for testing.
    random_state : int
        Seed for the train/test split so the split is reproducible.

    Returns:
    -------
    tuple
        (english_tokenizer, klingon_tokenizer, max_length,
         english_train_padded, klingon_train_input, klingon_train_target,
         english_test_padded, klingon_test_input, klingon_test_target)

        The `*_input` arrays are the padded Klingon ids without their last
        position, the `*_target` arrays are the same ids without their first
        position and with a trailing axis of size 1 added.
    """
    data = pd.read_csv(csv_path)

    # An empty cell is read as float NaN, which breaks both the string
    # concatenation below and the tokenizers, so incomplete pairs are dropped.
    data = data.dropna(subset=['english', 'klingon'])

    # Wrap every Klingon sentence in start/end markers for the decoder.
    data['klingon'] = data['klingon'].apply(lambda x: '<BOS> ' + x + ' <EOS>')

    english_sentences = data['english'].values
    klingon_sentences = data['klingon'].values

    english_train, english_test, klingon_train, klingon_test = train_test_split(
        english_sentences, klingon_sentences,
        test_size=test_size, random_state=random_state)

    # Both tokenizers are fitted on the training split only, so words that
    # appear only in the test split map to the <UNK> token.
    english_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=vocab_size, oov_token='<UNK>')
    klingon_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=vocab_size, oov_token='<UNK>')
    english_tokenizer.fit_on_texts(english_train)
    klingon_tokenizer.fit_on_texts(klingon_train)

    def _encode(tokenizer, sentences):
        # Text -> id sequences -> fixed-width array padded at the end.
        sequences = tokenizer.texts_to_sequences(sentences)
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=max_length, padding='post')

    def _decoder_pair(padded):
        # Decoder input drops the last position, the target drops the first,
        # so target[t] is the token that follows input[t].
        return padded[:, :-1], np.expand_dims(padded[:, 1:], -1)

    english_train_padded = _encode(english_tokenizer, english_train)
    english_test_padded = _encode(english_tokenizer, english_test)
    klingon_train_input, klingon_train_target = _decoder_pair(
        _encode(klingon_tokenizer, klingon_train))
    klingon_test_input, klingon_test_target = _decoder_pair(
        _encode(klingon_tokenizer, klingon_test))

    return (english_tokenizer, klingon_tokenizer, max_length,
            english_train_padded, klingon_train_input, klingon_train_target,
            english_test_padded, klingon_test_input, klingon_test_target)
55
+
Decoder.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
class Decoder(nn.Module):
    """
    GRU decoder. Starting from the hidden state (context vector) produced by
    the encoder, it predicts the Klingon sentence one token at a time.

    Parameters:
    ----------
    output_dim : int
        Size of the output (Klingon) vocabulary
    emb_dim : int
        Dimension of the embedding vectors
    hid_dim : int
        Number of features in the GRU's hidden state
    n_layers : int
        Number of GRU layers (typically 2)
    dropout : float
        Dropout probability for the dropout layer
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # nn.GRU only applies dropout between stacked layers and warns when a
        # non-zero value is passed for a single layer, so pass 0 in that case.
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers,
                          dropout=dropout if n_layers > 1 else 0.0)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, hidden):
        """
        Run a single decoding step.

        Parameters:
        ----------
        trg : Tensor
            Token indices for the current step (batch_size,)
        hidden : Tensor
            Previous hidden state (n_layers, batch_size, hid_dim); for the
            first step this is what the encoder returns

        Returns:
        -------
        prediction : Tensor
            Unnormalized scores over the vocabulary (batch_size, output_dim)
        hidden : Tensor
            Updated hidden state from the GRU (n_layers, batch_size, hid_dim)
        """
        # Add a sequence axis of length 1: (batch_size,) -> (1, batch_size)
        trg = trg.unsqueeze(0)
        # Look up the embeddings and apply dropout to them
        embedded = self.dropout(self.embedding(trg))
        # The GRU computes the new context from the previous one
        output, hidden = self.rnn(embedded, hidden)
        # Drop the sequence axis again and project onto the vocabulary
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden
Encoder.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+
3
class Encoder(nn.Module):
    """
    Seq2Seq GRU encoder. It compresses the sequential information of the
    input sentence into a hidden state that is handed to the decoder.

    Parameters:
    ----------
    input_dim : int
        Size of the input vocabulary
    emb_dim : int
        Dimension of the embedding vectors
    hid_dim : int
        Number of features in the GRU's hidden state
    n_layers : int
        Number of GRU layers (typically 2)
    dropout : float
        Dropout probability for the dropout layer
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        # Embedding layer
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # GRU layer. nn.GRU only applies dropout between stacked layers and
        # warns when a non-zero value is passed for a single layer.
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers,
                          dropout=dropout if n_layers > 1 else 0.0)
        # Dropout layer (applied to the embeddings)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        """
        Forward propagation step of encoding.

        Parameters:
        ----------
        input : Tensor
            Input tensor containing token indices (seq_len, batch_size)

        Returns:
        -------
        hidden : Tensor
            Hidden state tensor from the GRU (n_layers, batch_size, hid_dim)
        """
        # Token indices are converted into embeddings and dropout is applied
        embedded = self.dropout(self.embedding(input))
        # Forward pass through the GRU; the per-step outputs are not needed
        _, hidden = self.rnn(embedded)
        # Only the final hidden state is passed on to the decoder
        return hidden
English_to_Klingon.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdcacd00a6c6886c98c94c35c373b8eb7f563b38ce7046311a5a04036d6a7b40
3
+ size 73477686
README.md CHANGED
@@ -1,12 +1,12 @@
1
- ---
2
- title: KlingonHeads
3
- emoji: 😻
4
- colorFrom: purple
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 4.32.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: KlingonHeads
3
+ emoji: 😻
4
+ colorFrom: purple
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.32.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Seq2SeqModel.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import random
4
+ """
5
+ This class puts together the decoder and encoder and
6
+ receives Klingon and English data from the tokenization process
7
+
8
+ """
9
+
10
class Seq2SeqModel(nn.Module):
    """
    Ties an encoder and a decoder together: the encoder's final hidden state
    seeds the decoder, which is then unrolled one target position at a time.

    Parameters:
    ----------
    encoder : nn.Module
        Called as encoder(input); must expose `hid_dim` and `n_layers`
    decoder : nn.Module
        Called as decoder(token, hidden); must expose `hid_dim`, `n_layers`
        and `output_dim`
    device : torch.device
        Device on which the output tensor is allocated
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # The encoder's hidden state is fed straight into the decoder's GRU,
        # so both must agree on hidden size and number of layers.
        assert (encoder.hid_dim == decoder.hid_dim), "Hidden dimensions of encoder and decoder not equal"
        assert (encoder.n_layers == decoder.n_layers), "Encoder and decoder layers not equal"

    def forward(self, input, trg, teacher_forcing_ratio=0.5):
        """
        Parameters:
        ----------
        input : Tensor
            Input tensor containing token indices (seq_len, batch_size)
            Tokenized English data
        trg : Tensor
            Target tensor containing token indices (seq_len, batch_size)
            Tokenized Klingon data; trg[0] is used as the first decoder input
        teacher_forcing_ratio : float
            Probability, per decoding step, of feeding the ground-truth token
            instead of the model's own prediction. Use 0 for inference.

        Returns:
        -------
        outputs : Tensor
            Decoder scores (seq_len, batch_size, output_dim). outputs[0] is
            never written and stays all zeros.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        trg_size = self.decoder.output_dim
        # Buffer for the decoder outputs, allocated directly on the device
        outputs = torch.zeros(trg_length, batch_size, trg_size, device=self.device)
        # The encoder's final hidden state is the decoder's initial context
        hidden = self.encoder(input)
        # The first decoder input is the start-of-sentence token, which tells
        # the decoder to start making predictions
        input = trg[0, :]
        for t in range(1, trg_length):
            # One decoder step; hidden keeps getting updated
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output

            # Teacher forcing is decided by chance, once per step for the
            # whole batch
            teacher_force = random.random() < teacher_forcing_ratio

            # Highest-scoring token for every sentence in the batch
            highest = output.argmax(1)

            # Feed the ground-truth token when teacher forcing, otherwise the
            # model's own prediction
            input = trg[t] if teacher_force else highest

        return outputs
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import tensorflow as tf
3
+ import gradio as gr
4
+ import re
5
+
6
+ from Seq2SeqModel import Seq2SeqModel
7
+ from DataPPwithspecial import preprocess
8
+ from Decoder import Decoder
9
+ from Encoder import Encoder
10
# Model parameters. load_state_dict below is strict, so the layer sizes built
# from these values have to match the ones stored in the checkpoint.
n_layers = 2
emb_dim = 256
hid_dim = 512
dropout = 0.5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Use GPU if available, otherwise use CPU

# Only the tokenizers and the padding length are needed for inference; the
# training/testing arrays returned by preprocess() are discarded.
(english_tokenizer, klingon_tokenizer, max_english_length,
 _, _, _, _, _, _) = preprocess()
input_dim = len(english_tokenizer.word_index) + 1  # Add 1 for the padding token
output_dim = len(klingon_tokenizer.word_index) + 1  # Add 1 for the padding token

# Initialize encoder and decoder
encoder = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout).to(device)
decoder = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout).to(device)

# Initialize the Seq2SeqModel
model = Seq2SeqModel(encoder, decoder, device).to(device)

# Load the saved weights. map_location remaps the stored tensors onto the
# device selected above; without it a checkpoint saved from a GPU cannot be
# loaded on a CPU-only host.
model.load_state_dict(torch.load('./backend/English_to_Klingon.pth', map_location=device))
model.eval()  # Evaluation mode: disables dropout
33
+
34
+ #tokenize the English input
35
def preprocess_sentence(sentence, tokenizer, max_length):
    """
    Turn one raw sentence into a padded tensor of token ids on `device`.

    The sentence is tokenized with `tokenizer`, padded at the end to
    `max_length`, and returned as a torch.long tensor with one row
    (the single sentence) and `max_length` columns.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    # The tokenizer works on a batch of texts, so wrap the sentence in a list
    token_ids = tokenizer.texts_to_sequences([sentence])
    padded_ids = pad_sequences(token_ids, maxlen=max_length, padding='post')
    id_tensor = torch.tensor(padded_ids, dtype=torch.long)
    return id_tensor.to(device)
41
+
42
+ # Translation function for Gradio
43
def translate_english_to_klingon(english_sentence):
    """
    Translate one English sentence into Klingon with the loaded model.

    Parameters:
    ----------
    english_sentence : str
        Text entered by the user

    Returns:
    -------
    str
        Predicted Klingon words joined by single spaces, cut off at the
        first end-of-sentence token
    """
    # preprocess_sentence returns one row of ids; the model expects
    # (seq_len, batch_size), so turn (1, seq_len) into (seq_len, 1).
    source = preprocess_sentence(english_sentence, english_tokenizer, max_english_length)
    source = source.squeeze(0).unsqueeze(1)

    # The model seeds decoding with trg[0], so the target must start with the
    # Klingon start-of-sentence token. Passing the English ids here would seed
    # the decoder with an unrelated (possibly out-of-range) Klingon id.
    # The tokenizer lowercases and strips '<' and '>', so '<BOS>' is 'bos'.
    bos_index = klingon_tokenizer.word_index['bos']
    decoder_seed = torch.full_like(source, bos_index)

    # Perform inference; teacher forcing ratio 0 means only trg[0] is read
    with torch.no_grad():
        output = model(source, decoder_seed, 0)

    # Best-scoring token per position for the single sentence in the batch
    output_indices = torch.argmax(output, dim=-1).squeeze(1).tolist()

    words = []
    for idx in output_indices:
        if idx == 0:
            # Padding id; this also skips position 0, which the model never fills
            continue
        word = klingon_tokenizer.index_word[idx]
        if word == 'eos':
            # Everything after the end-of-sentence token is not part of the
            # translation
            break
        words.append(word)
    return ' '.join(words)
61
+
62
+
63
+ # Create Gradio interface
64
# Example phrases offered in the UI; Gradio expects one list per example row.
examples = [[phrase] for phrase in (
    "Hello, how are you?",
    "What is your name?",
    "I love learning new languages.",
    "Where is the nearest starbase?",
    "Can you tell me more about your planet?",
)]

# Single text box in, single text box out, wired to the translation function.
iface = gr.Interface(
    title="English to Klingon Translation",
    description="Enter text in English and get its translation in Klingon. This translator helps you convert everyday English phrases into the fictional language spoken by the Klingon species in the Star Trek universe. Try one of the example sentences to see how it works!",
    fn=translate_english_to_klingon,
    inputs=gr.Textbox(
        label="English Phrase",
        lines=2,
        placeholder="Enter English text here...",
    ),
    outputs=gr.Textbox(
        label="Klingon Translation",
        lines=2,
    ),
    examples=examples,
    theme="default",
)

# Start the web app and request a public share link.
iface.launch(share=True)
83
+
requirements.txt ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ aiofiles==23.2.1
3
+ altair==5.3.0
4
+ annotated-types==0.7.0
5
+ anyio==4.4.0
6
+ astunparse==1.6.3
7
+ attrs==23.2.0
8
+ blinker==1.8.2
9
+ certifi==2024.2.2
10
+ charset-normalizer==3.3.2
11
+ click==8.1.7
12
+ colorama==0.4.6
13
+ contourpy==1.2.1
14
+ cycler==0.12.1
15
+ dnspython==2.6.1
16
+ email_validator==2.1.1
17
+ fastapi==0.111.0
18
+ fastapi-cli==0.0.4
19
+ ffmpy==0.3.2
20
+ filelock==3.14.0
21
+ Flask==3.0.3
22
+ flatbuffers==24.3.25
23
+ fonttools==4.53.0
24
+ fsspec==2024.5.0
25
+ gast==0.5.4
26
+ google-pasta==0.2.0
27
+ gradio==4.32.1
28
+ gradio_client==0.17.0
29
+ grpcio==1.64.0
30
+ h11==0.14.0
31
+ h5py==3.11.0
32
+ httpcore==1.0.5
33
+ httptools==0.6.1
34
+ httpx==0.27.0
35
+ huggingface-hub==0.23.2
36
+ idna==3.7
37
+ importlib_resources==6.4.0
38
+ intel-openmp==2021.4.0
39
+ itsdangerous==2.2.0
40
+ Jinja2==3.1.4
41
+ joblib==1.4.2
42
+ jsonschema==4.22.0
43
+ jsonschema-specifications==2023.12.1
44
+ keras==3.3.3
45
+ kiwisolver==1.4.5
46
+ libclang==18.1.1
47
+ Markdown==3.6
48
+ markdown-it-py==3.0.0
49
+ MarkupSafe==2.1.5
50
+ matplotlib==3.9.0
51
+ mdurl==0.1.2
52
+ mkl==2021.4.0
53
+ ml-dtypes==0.3.2
54
+ mpmath==1.3.0
55
+ namex==0.0.8
56
+ networkx==3.3
57
+ numpy==1.26.4
58
+ opt-einsum==3.3.0
59
+ optree==0.11.0
60
+ orjson==3.10.3
61
+ packaging==24.0
62
+ pandas==2.2.2
63
+ pillow==10.3.0
64
+ protobuf==4.25.3
65
+ pydantic==2.7.2
66
+ pydantic_core==2.18.3
67
+ pydub==0.25.1
68
+ Pygments==2.18.0
69
+ pyparsing==3.1.2
70
+ python-dateutil==2.9.0.post0
71
+ python-dotenv==1.0.1
72
+ python-multipart==0.0.9
73
+ pytz==2024.1
74
+ PyYAML==6.0.1
75
+ referencing==0.35.1
76
+ requests==2.32.3
77
+ rich==13.7.1
78
+ rpds-py==0.18.1
79
+ ruff==0.4.7
80
+ scikit-learn==1.5.0
81
+ scipy==1.13.1
82
+ semantic-version==2.10.0
83
+ setuptools==70.0.0
84
+ shellingham==1.5.4
85
+ six==1.16.0
86
+ sniffio==1.3.1
87
+ starlette==0.37.2
88
+ sympy==1.12.1
89
+ tbb==2021.12.0
90
+ tensorboard==2.16.2
91
+ tensorboard-data-server==0.7.2
92
+ tensorflow==2.16.1
93
+ tensorflow-intel==2.16.1
94
+ termcolor==2.4.0
95
+ threadpoolctl==3.5.0
96
+ tomlkit==0.12.0
97
+ toolz==0.12.1
98
+ torch==2.3.0+cu118
99
+ torchaudio==2.3.0+cu118
100
+ torchvision==0.18.0+cu118
101
+ tqdm==4.66.4
102
+ typer==0.12.3
103
+ typing_extensions==4.12.0
104
+ tzdata==2024.1
105
+ ujson==5.10.0
106
+ urllib3==2.2.1
107
+ uvicorn==0.30.0
108
+ watchfiles==0.22.0
109
+ websockets==11.0.3
110
+ Werkzeug==3.0.3
111
+ wheel==0.43.0
112
+ wrapt==1.16.0