Spaces:

DiegoTheExplorar
/

KlingonHeads

Running

App Files Files Community

DiegoTheExplorar commited on Jun 1, 2024

Commit

a2570fa

verified ·

1 Parent(s): f41dc5a

Upload 8 files

Browse files

Files changed (8) hide show

DataPPwithspecial.py +55 -0
Decoder.py +66 -0
Encoder.py +51 -0
English_to_Klingon.pth +3 -0
README.md +12 -12
Seq2SeqModel.py +71 -0
app.py +83 -0
requirements.txt +112 -0

DataPPwithspecial.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import tensorflow as tf
+def preprocess(csv_path='./backend/English_To_Klingon.csv', num_words=5000,
+               max_length=50, test_size=0.2, random_state=42):
+    """
+    Load the English/Klingon sentence pairs and turn them into padded index arrays.
+    Every Klingon sentence is wrapped in '<BOS> ... <EOS>', the pairs are split
+    into a training and a testing set, one tokenizer per language is fitted on
+    the training split only, and all sentences are post-padded to max_length.
+    Called without arguments it behaves exactly like the original hard-coded version.
+    Parameters:
+    ----------
+    csv_path : str
+        CSV file with an 'english' and a 'klingon' column
+    num_words : int
+        num_words given to both Keras tokenizers
+    max_length : int
+        Length every tokenized sentence is padded to
+    test_size : float
+        Fraction of the sentence pairs held out for testing
+    random_state : int
+        Seed of the train/test split
+    Returns:
+    -------
+    tuple
+        (english_tokenizer, klingon_tokenizer, max_length,
+         english_train_padded, klingon_train_input, klingon_train_target,
+         english_test_padded, klingon_test_input, klingon_test_target)
+        The Klingon *_input arrays drop the last position; the *_target arrays
+        drop the first position and carry an extra trailing axis of size 1.
+    """
+    def fit_tokenizer(texts):
+        # One tokenizer per language, fitted on training sentences only
+        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, oov_token='<UNK>')
+        tokenizer.fit_on_texts(texts)
+        return tokenizer
+    def encode(tokenizer, texts):
+        # Sentences -> index sequences -> fixed-length, post-padded array
+        sequences = tokenizer.texts_to_sequences(texts)
+        return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post')
+    def split_decoder_arrays(padded):
+        # Decoder input is the sentence without its last position; the target is
+        # the same sentence without its first position, i.e. shifted by one step
+        return padded[:, :-1], np.expand_dims(padded[:, 1:], -1)
+    # Load dataset
+    data = pd.read_csv(csv_path)
+    english_sentences = data['english'].values
+    # Append <BOS> and <EOS> tags to the Klingon sentences
+    klingon_sentences = data['klingon'].apply(lambda x: '<BOS> ' + x + ' <EOS>').values
+    # Split data into training and testing sets (80 - 20 with the default test_size)
+    english_train, english_test, klingon_train, klingon_test = train_test_split(
+        english_sentences, klingon_sentences, test_size=test_size, random_state=random_state)
+    english_tokenizer = fit_tokenizer(english_train)
+    klingon_tokenizer = fit_tokenizer(klingon_train)
+    english_train_padded = encode(english_tokenizer, english_train)
+    english_test_padded = encode(english_tokenizer, english_test)
+    klingon_train_input, klingon_train_target = split_decoder_arrays(encode(klingon_tokenizer, klingon_train))
+    klingon_test_input, klingon_test_target = split_decoder_arrays(encode(klingon_tokenizer, klingon_test))
+    return (english_tokenizer, klingon_tokenizer, max_length,
+            english_train_padded, klingon_train_input, klingon_train_target,
+            english_test_padded, klingon_test_input, klingon_test_target)

Decoder.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import torch.nn as nn
+class Decoder(nn.Module):
+    """
+    GRU decoder of the Seq2Seq model. Starting from the hidden state
+    (context vector) returned by the encoder, it predicts the Klingon
+    translation of the English input one token at a time.
+    Parameters:
+    ----------
+    output_dim : int
+        Size of the output vocabulary
+    emb_dim : int
+        Dimension of the embedding vectors
+    hid_dim : int
+        Number of features in the GRU's hidden state
+    n_layers : int
+        Number of GRU layers (typically 2)
+    dropout : float
+        Dropout probability, applied to the embeddings and passed to the GRU
+    """
+    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
+        super().__init__()
+        # Kept as attributes because Seq2SeqModel reads hid_dim, n_layers and output_dim
+        self.hid_dim = hid_dim
+        self.output_dim = output_dim
+        self.n_layers = n_layers
+        self.embedding = nn.Embedding(output_dim, emb_dim)
+        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)
+        self.fc_out = nn.Linear(hid_dim, output_dim)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, trg, hidden):
+        """
+        Run a single decoding step.
+        Parameters:
+        ----------
+        trg : Tensor
+            Token indices fed to the decoder at this step (batch_size)
+        hidden : Tensor
+            Previous hidden state (n_layers, batch_size, hid_dim); for the
+            first step this is what the encoder returns
+        Returns:
+        -------
+        prediction : Tensor
+            Scores over the output vocabulary (batch_size, output_dim)
+        hidden : Tensor
+            Updated hidden state from the GRU (n_layers, batch_size, hid_dim)
+        """
+        # Add a leading sequence axis of length 1: (batch_size) -> (1, batch_size)
+        trg = trg.unsqueeze(0)
+        # Tokens are converted into embeddings and dropout is applied
+        embedded = self.dropout(self.embedding(trg))
+        # GRU layer computes the new context based on the previous context
+        output, hidden = self.rnn(embedded, hidden)
+        # Drop the sequence axis again and project onto the output vocabulary
+        prediction = self.fc_out(output.squeeze(0))
+        return prediction, hidden

Encoder.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import torch.nn as nn
+class Encoder(nn.Module):
+    """
+    Seq2Seq encoder for the GRU model. It condenses the sequential
+    information of the input sentence into a hidden state that is
+    passed on to the decoder.
+    Parameters:
+    ----------
+    input_dim : int
+        Size of the input vocabulary
+    emb_dim : int
+        Dimension of the embedding vectors
+    hid_dim : int
+        Number of features in the GRU's hidden state
+    n_layers : int
+        Number of GRU layers (typically 2)
+    dropout : float
+        Dropout probability, applied to the embeddings and passed to the GRU
+    """
+    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
+        super().__init__()
+        # Kept as attributes because Seq2SeqModel compares them with the decoder's
+        self.hid_dim = hid_dim
+        self.n_layers = n_layers
+        # Embedding layer
+        self.embedding = nn.Embedding(input_dim, emb_dim)
+        # GRU layer
+        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)
+        # Dropout layer
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, input):
+        """
+        Forward propagation step of encoding.
+        Parameters:
+        ----------
+        input : Tensor
+            Input tensor containing token indices (seq_len, batch_size)
+        Returns:
+        -------
+        hidden : Tensor
+            Hidden state tensor from the GRU (n_layers, batch_size, hid_dim)
+        """
+        # Input is converted into embeddings and dropout is applied
+        embedded = self.dropout(self.embedding(input))
+        # Forward pass through the GRU; its per-step outputs are discarded
+        _, hidden = self.rnn(embedded)
+        # Only the hidden state is required for encoding
+        return hidden

English_to_Klingon.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdcacd00a6c6886c98c94c35c373b8eb7f563b38ce7046311a5a04036d6a7b40
+size 73477686

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
----
-title: KlingonHeads
-emoji: 😻
-colorFrom: purple
-colorTo: gray
-sdk: gradio
-sdk_version: 4.32.1
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: KlingonHeads
+emoji: 😻
+colorFrom: purple
+colorTo: gray
+sdk: gradio
+sdk_version: 4.32.1
+app_file: app.py
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

Seq2SeqModel.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import torch.nn as nn
+import torch
+import random
+"""
+    This class puts together the decoder and encoder and
+    receives Klingon and English data from the tokenization process
+"""
+class Seq2SeqModel(nn.Module):
+    """
+    Couples an encoder and a decoder into one sequence-to-sequence model.
+    Parameters:
+    ----------
+    encoder : nn.Module
+        Module returning a hidden state for an input sequence; must expose
+        hid_dim and n_layers
+    decoder : nn.Module
+        Module called as decoder(tokens, hidden) that returns
+        (prediction, hidden); must expose hid_dim, n_layers and output_dim
+    device : torch.device
+        Device on which the output tensor is allocated
+    """
+    def __init__(self, encoder, decoder, device):
+        super().__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        self.device = device
+        # The encoder's hidden state is fed straight into the decoder, so both
+        # must agree on the hidden size and on the number of layers
+        assert (encoder.hid_dim == decoder.hid_dim), "Hidden dimensions of encoder and decoder not equal"
+        assert (encoder.n_layers == decoder.n_layers), "Encoder and decoder layers not equal"
+    def forward(self,input, trg, teacher_forcing_ratio):
+        """
+        Encode the input and decode it step by step.
+        Parameters:
+        ----------
+        input : Tensor
+            Input tensor containing token indices (seq_len, batch_size)
+            Tokenized English Data
+        trg : Tensor
+            Target tensor containing token indices (seq_len, batch_size)
+            Tokenized Klingon Data; trg[0] is used as the first decoder input
+        teacher_forcing_ratio: double
+            Probability, per step, of feeding the ground-truth token instead
+            of the model's own prediction (used during training)
+        Returns:
+        -------
+        outputs : Tensor
+            Decoder predictions (seq_len, batch_size, output_dim).
+            outputs[0] is never written and stays all zeros.
+        """
+        trg_length, batch_size = trg.shape[0], trg.shape[1]
+        trg_size = self.decoder.output_dim
+        # Storing decoder outputs
+        outputs = torch.zeros(trg_length, batch_size, trg_size, device=self.device)
+        # Output of the encoder is the initial context for the decoder
+        hidden = self.encoder(input)
+        # The first decoder input is the first target token (the start of
+        # sentence token), which tells the decoder to start making predictions
+        decoder_input = trg[0, :]
+        for t in range(1, trg_length):
+            # Forward pass through the decoder; hidden keeps getting updated
+            output, hidden = self.decoder(decoder_input, hidden)
+            # Store the prediction made at this step
+            outputs[t] = output
+            # Leaving usage of teacher forcing to chance
+            teacher_force = random.random() < teacher_forcing_ratio
+            # Get the highest predicted token from our predictions
+            highest = output.argmax(1)
+            # With teacher forcing feed the ground-truth token, else the predicted one
+            decoder_input = trg[t] if teacher_force else highest
+        return outputs

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import torch
+import tensorflow as tf
+import gradio as gr
+import re
+from Seq2SeqModel import Seq2SeqModel
+from DataPPwithspecial import preprocess
+from Decoder import Decoder
+from Encoder import Encoder
+# Model parameters (must match the values the checkpoint was trained with,
+# otherwise load_state_dict below fails on mismatching tensor shapes)
+n_layers = 2
+emb_dim = 256
+hid_dim = 512
+dropout = 0.5
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Use GPU if available, otherwise use CPU
+# Load preprocessed data and model parameters
+(english_tokenizer, klingon_tokenizer, max_english_length,
+    _, _, _, _, _, _) = preprocess()  # We don't need training data for inference
+input_dim = len(english_tokenizer.word_index) + 1  # Add 1 for the padding token
+output_dim = len(klingon_tokenizer.word_index) + 1  # Add 1 for the padding token
+# Initialize encoder and decoder
+encoder = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout).to(device)
+decoder = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout).to(device)
+# Initialize the Seq2SeqModel
+model = Seq2SeqModel(encoder, decoder, device).to(device)
+# Load the saved model. map_location remaps the stored tensors onto the
+# selected device, so a checkpoint saved on a GPU also loads on a CPU-only host
+model.load_state_dict(torch.load('./backend/English_to_Klingon.pth', map_location=device))
+model.eval()  # Set the model to evaluation mode
+#tokenize the English input
+def preprocess_sentence(sentence, tokenizer, max_length):
+    """
+    Convert one sentence into a padded tensor of token indices.
+    Parameters:
+    ----------
+    sentence : str
+        Raw text to encode
+    tokenizer : Tokenizer
+        Fitted Keras tokenizer used to map words to indices
+    max_length : int
+        Length the index sequence is post-padded to
+    Returns:
+    -------
+    Tensor
+        Long tensor on the module-level device holding the padded indices
+        of the single sentence
+    """
+    # The tokenizer works on batches, so the sentence is wrapped in a list
+    sequences = tokenizer.texts_to_sequences([sentence])
+    padded = tf.keras.preprocessing.sequence.pad_sequences(
+        sequences, maxlen=max_length, padding='post')
+    indices = torch.tensor(padded, dtype=torch.long)
+    return indices.to(device)
+# Translation function for Gradio
+def translate_english_to_klingon(english_sentence):
+    """
+    Translate an English sentence into Klingon by greedy decoding.
+    Parameters:
+    ----------
+    english_sentence : str
+        Text entered by the user
+    Returns:
+    -------
+    str
+        Predicted Klingon words joined by single spaces, cut at the first
+        end-of-sentence token
+    """
+    # Preprocess the input English sentence and reshape it from
+    # (1, max_length) to the (seq_len, batch_size) layout the model expects
+    input_sentence = preprocess_sentence(english_sentence, english_tokenizer, max_english_length)
+    input_sentence = input_sentence.squeeze(0).unsqueeze(1)
+    # The Keras tokenizer lowercases text and filters '<' and '>' by default,
+    # so the '<BOS>'/'<EOS>' tags are stored in the vocabulary as 'bos'/'eos'
+    bos_index = klingon_tokenizer.word_index['bos']
+    # With a teacher forcing ratio of 0 the model only reads the target's length
+    # and its first row, so the decoder is started with the Klingon start token
+    # instead of an English token index
+    target = torch.zeros_like(input_sentence)
+    target[0, 0] = bos_index
+    # Perform inference
+    with torch.no_grad():
+        output = model(input_sentence, target, 0)
+    # Convert output indices to Klingon words
+    output_indices = torch.argmax(output, dim=-1).squeeze(1).tolist()
+    klingon_words = []
+    for idx in output_indices:
+        if idx == 0:
+            # Padding token (also the untouched first output position)
+            continue
+        word = klingon_tokenizer.index_word[idx]
+        if word == 'eos':
+            # Everything after the end-of-sentence token is not part of the translation
+            break
+        klingon_words.append(word)
+    return ' '.join(klingon_words)
+# Create Gradio interface
+# Example inputs offered in the UI; each inner list holds the value for the single input textbox
+examples = [
+    ["Hello, how are you?"],
+    ["What is your name?"],
+    ["I love learning new languages."],
+    ["Where is the nearest starbase?"],
+    ["Can you tell me more about your planet?"]
+]
+# One text box in, one text box out, wired to translate_english_to_klingon
+iface = gr.Interface(
+    fn=translate_english_to_klingon,
+    inputs=gr.Textbox(label = "English Phrase",lines=2, placeholder="Enter English text here..."),
+    outputs=gr.Textbox(label="Klingon Translation",lines=2),
+    title="English to Klingon Translation",
+    description="Enter text in English and get its translation in Klingon. This translator helps you convert everyday English phrases into the fictional language spoken by the Klingon species in the Star Trek universe. Try one of the example sentences to see how it works!",
+    examples=examples,
+    theme="default"
+)
+# Start the app when the module is executed.
+# NOTE(review): share=True requests a public share link; presumably not needed
+# when the app is hosted as a Hugging Face Space - confirm and drop if so
+iface.launch(share = True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,112 @@

+absl-py==2.1.0
+aiofiles==23.2.1
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+astunparse==1.6.3
+attrs==23.2.0
+blinker==1.8.2
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.1
+cycler==0.12.1
+dnspython==2.6.1
+email_validator==2.1.1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+ffmpy==0.3.2
+filelock==3.14.0
+Flask==3.0.3
+flatbuffers==24.3.25
+fonttools==4.53.0
+fsspec==2024.5.0
+gast==0.5.4
+google-pasta==0.2.0
+gradio==4.32.1
+gradio_client==0.17.0
+grpcio==1.64.0
+h11==0.14.0
+h5py==3.11.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.2
+idna==3.7
+importlib_resources==6.4.0
+intel-openmp==2021.4.0
+itsdangerous==2.2.0
+Jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+keras==3.3.3
+kiwisolver==1.4.5
+libclang==18.1.1
+Markdown==3.6
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.9.0
+mdurl==0.1.2
+mkl==2021.4.0
+ml-dtypes==0.3.2
+mpmath==1.3.0
+namex==0.0.8
+networkx==3.3
+numpy==1.26.4
+opt-einsum==3.3.0
+optree==0.11.0
+orjson==3.10.3
+packaging==24.0
+pandas==2.2.2
+pillow==10.3.0
+protobuf==4.25.3
+pydantic==2.7.2
+pydantic_core==2.18.3
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.35.1
+requests==2.32.3
+rich==13.7.1
+rpds-py==0.18.1
+ruff==0.4.7
+scikit-learn==1.5.0
+scipy==1.13.1
+semantic-version==2.10.0
+setuptools==70.0.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+starlette==0.37.2
+sympy==1.12.1
+tbb==2021.12.0
+tensorboard==2.16.2
+tensorboard-data-server==0.7.2
+tensorflow==2.16.1
+tensorflow-intel==2.16.1
+termcolor==2.4.0
+threadpoolctl==3.5.0
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.3.0+cu118
+torchaudio==2.3.0+cu118
+torchvision==0.18.0+cu118
+tqdm==4.66.4
+typer==0.12.3
+typing_extensions==4.12.0
+tzdata==2024.1
+ujson==5.10.0
+urllib3==2.2.1
+uvicorn==0.30.0
+watchfiles==0.22.0
+websockets==11.0.3
+Werkzeug==3.0.3
+wheel==0.43.0
+wrapt==1.16.0