Fishfishfishfishfish committed on
Upload 4 files
- continue.py +117 -0
- inference.py +83 -0
- tokenizer.js +26 -0
- trainer.py +113 -0
continue.py
ADDED
@@ -0,0 +1,117 @@
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
from torch.utils.data import Dataset, DataLoader
from safetensors.torch import load_file, save_file
import logging
import json

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Hyperparameters
# NOTE: embedding_dim, hidden_dim and num_layers must match the values the
# checkpoint in lstm_model.safetensors was trained with (see trainer.py),
# otherwise load_state_dict below fails with a shape mismatch.
sequence_length = 16
batch_size = 32
num_epochs = 1  # Continue training for 1 more epoch
learning_rate = 0.00001
embedding_dim = 256
hidden_dim = 512
num_layers = 2

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        logits = self.fc(lstm_out[:, -1, :])
        return logits

# Load the model and vocabulary
logging.info('Loading the model and vocabulary...')
model_state_dict = load_file('lstm_model.safetensors')
with open('word2idx.pkl', 'rb') as f:
    word2idx = pickle.load(f)
with open('idx2word.pkl', 'rb') as f:
    idx2word = pickle.load(f)

vocab_size = len(word2idx)
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers)
model.load_state_dict(model_state_dict)
model.train()

logging.info('Model and vocabulary loaded successfully.')

# Output the total number of parameters
total_params = sum(p.numel() for p in model.parameters())
logging.info(f'Total number of parameters: {total_params}')

# Read the text file
logging.info('Reading the text file...')
with open('text.txt', 'r') as file:
    text = file.read()
logging.info('Text file read successfully.')

# Preprocess the text (text.txt is expected to contain a JSON array of tokens)
logging.info('Preprocessing the text...')
words = json.loads(text)
sequences = []
for i in range(len(words) - sequence_length):
    seq = words[i:i + sequence_length]
    label = words[i + sequence_length]
    sequences.append((seq, label))

logging.info(f'Number of sequences: {len(sequences)}')

# Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, sequences, word2idx):
        self.sequences = sequences
        self.word2idx = word2idx

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq, label = self.sequences[idx]
        seq_idx = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in seq]
        label_idx = self.word2idx.get(label, self.word2idx['<UNK>'])
        return torch.tensor(seq_idx, dtype=torch.long), torch.tensor(label_idx, dtype=torch.long)

logging.info('Creating dataset and dataloader...')
dataset = TextDataset(sequences, word2idx)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Continue training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

logging.info('Starting continued training...')
for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(dataloader):
        inputs, targets = batch
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            logging.info(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}')

# Save the updated model
logging.info('Saving the updated model...')
save_file(model.state_dict(), 'lstm_model.safetensors')
with open('word2idx.pkl', 'wb') as f:
    pickle.dump(word2idx, f)
with open('idx2word.pkl', 'wb') as f:
    pickle.dump(idx2word, f)

logging.info('Updated model and vocabulary saved successfully.')
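
Side note (not part of the uploaded file): because continue.py hard-codes the architecture, a small check like the sketch below could read the shapes back out of the saved checkpoint and confirm they agree with the hyperparameters above before load_state_dict is called. The key names follow the state dict of the LSTMModel defined above; treat this as an illustrative sketch, not repository code.

# Illustrative sketch: recover the architecture from lstm_model.safetensors.
from safetensors.torch import load_file

state = load_file('lstm_model.safetensors')
ckpt_vocab_size, ckpt_embedding_dim = state['embedding.weight'].shape   # (vocab_size, embedding_dim)
ckpt_hidden_dim = state['fc.weight'].shape[1]                           # fc.weight is (vocab_size, hidden_dim)
ckpt_num_layers = sum(1 for k in state if k.startswith('lstm.weight_ih_l'))
print(ckpt_vocab_size, ckpt_embedding_dim, ckpt_hidden_dim, ckpt_num_layers)
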
inference.py
ADDED
@@ -0,0 +1,83 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
from safetensors.torch import load_file
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Hyperparameters
# NOTE: embedding_dim, hidden_dim and num_layers must match the checkpoint
# saved by trainer.py, otherwise load_state_dict below fails with a shape mismatch.
embedding_dim = 8
hidden_dim = 16
num_layers = 1
sequence_length = 64
temp = 1.0  # Temperature parameter
top_k = 10  # Top-k sampling parameter

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        logits = self.fc(lstm_out[:, -1, :])
        return logits

# Load the model and vocabulary
logging.info('Loading the model and vocabulary...')
model_state_dict = load_file('lstm_model.safetensors')
with open('word2idx.pkl', 'rb') as f:
    word2idx = pickle.load(f)
with open('idx2word.pkl', 'rb') as f:
    idx2word = pickle.load(f)

vocab_size = len(word2idx)
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers)
model.load_state_dict(model_state_dict)
model.eval()

logging.info('Model and vocabulary loaded successfully.')

# Function to predict the next word with temperature and top-k sampling
def predict_next_word(model, word2idx, idx2word, sequence, sequence_length, temp, top_k):
    model.eval()
    with torch.no_grad():
        seq_idx = [word2idx.get(word, word2idx['<UNK>']) for word in sequence.split()]
        seq_idx = seq_idx[-sequence_length:]  # Keep only the last sequence_length tokens
        seq_tensor = torch.tensor(seq_idx, dtype=torch.long).unsqueeze(0)
        outputs = model(seq_tensor)
        outputs = outputs / temp  # Apply temperature
        probs = F.softmax(outputs, dim=1).squeeze()
        top_k_probs, top_k_idx = torch.topk(probs, top_k)
        predicted_idx = torch.multinomial(top_k_probs, 1).item()
        predicted_word = idx2word[top_k_idx[predicted_idx].item()]
        return predicted_word

# Function to generate a sentence
def generate_sentence(model, word2idx, idx2word, start_sequence, sequence_length, temp, top_k, max_length=50):
    sentence = start_sequence
    for _ in range(max_length):
        next_word = predict_next_word(model, word2idx, idx2word, sentence, sequence_length, temp, top_k)
        sentence += ' ' + next_word
        if next_word == '<pad>' or next_word == 'User':
            break
    return sentence

# Example usage
start_sequence = "User : What is the capital of France ? Bot :"

temp = 0.5  # Adjust temperature
top_k = 32  # Adjust top-k
logging.info(f'Starting sequence: {start_sequence}')
logging.info(f'Temperature: {temp}, Top-k: {top_k}')
generated_sentence = generate_sentence(model, word2idx, idx2word, start_sequence, sequence_length, temp, top_k)
logging.info(f'Generated sentence: {generated_sentence}')
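
Optional debugging aid (not part of the uploaded file): a short sketch, reusing the model and vocabulary loaded above, that prints the top-k candidate words with their probabilities for a prompt. It makes the effect of temp and top_k easier to see; the function name show_top_k is ours, not the repository's.

# Illustrative sketch: inspect what top-k sampling is choosing between.
def show_top_k(model, word2idx, idx2word, sequence, sequence_length, temp, top_k):
    with torch.no_grad():
        seq_idx = [word2idx.get(w, word2idx['<UNK>']) for w in sequence.split()][-sequence_length:]
        logits = model(torch.tensor(seq_idx, dtype=torch.long).unsqueeze(0)) / temp
        probs = F.softmax(logits, dim=1).squeeze()
        top_probs, top_idx = torch.topk(probs, top_k)
        for p, i in zip(top_probs.tolist(), top_idx.tolist()):
            print(f'{idx2word[i]!r}: {p:.3f}')

show_top_k(model, word2idx, idx2word, start_sequence, sequence_length, temp, top_k)
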
tokenizer.js
ADDED
@@ -0,0 +1,26 @@
const fs = require('fs');

// Split on whitespace and punctuation, keep the separators as tokens,
// and drop empty / whitespace-only entries.
function tokenizeText(text) {
    return text.split(/([\s,.!?:;()*-])/).filter(token => token.trim() !== '');
}

fs.readFile('text.txt', 'utf8', (err, data) => {
    if (err) {
        console.error('Error reading file:', err);
        return;
    }

    const tokens = tokenizeText(data);

    const jsonData = JSON.stringify(tokens);

    fs.writeFile('tokens.json', jsonData, (err) => {
        if (err) {
            console.error('Error writing file:', err);
        } else {
            console.log('Tokens written to tokens.json');
        }
    });
});
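
For reference, a rough Python equivalent of the same splitting rule (an illustrative sketch, not one of the uploaded files), for cases where the corpus needs to be tokenized without Node.js. The file names text.txt and tokens.json mirror tokenizer.js above.

# Illustrative sketch: Python version of the tokenizer.js splitting rule.
import json
import re

with open('text.txt', 'r') as f:
    text = f.read()

# Capturing group keeps the separators as tokens, matching the JS behaviour.
tokens = [t for t in re.split(r'([\s,.!?:;()*\-])', text) if t.strip() != '']

with open('tokens.json', 'w') as f:
    json.dump(tokens, f)
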
trainer.py
ADDED
@@ -0,0 +1,113 @@
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pickle
from torch.utils.data import Dataset, DataLoader
from safetensors.torch import save_file
import logging
import json

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Hyperparameters
sequence_length = 64
batch_size = 1
num_epochs = 1
learning_rate = 0.00001
embedding_dim = 256
hidden_dim = 800
num_layers = 4

# Read the text file
logging.info('Reading the text file...')
with open('text.txt', 'r') as file:
    text = file.read()
logging.info('Text file read successfully.')

# Preprocess the text
logging.info('Preprocessing the text...')
words = json.loads(text)  # Expects a JSON array of tokens (the format tokenizer.js produces)
vocab = set(words)
vocab.add('<pad>')
vocab.add('<UNK>')
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

logging.info(f'Vocabulary size: {vocab_size}')
# logging.info(f'Word to index mapping: {word2idx}')

# Create sequences
logging.info('Creating sequences...')
sequences = []
for i in range(len(words) - sequence_length):
    seq = words[i:i + sequence_length]
    label = words[i + sequence_length]
    sequences.append((seq, label))

logging.info(f'Number of sequences: {len(sequences)}')

# Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, sequences, word2idx):
        self.sequences = sequences
        self.word2idx = word2idx

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq, label = self.sequences[idx]
        seq_idx = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in seq]
        label_idx = self.word2idx.get(label, self.word2idx['<UNK>'])
        return torch.tensor(seq_idx, dtype=torch.long), torch.tensor(label_idx, dtype=torch.long)

logging.info('Creating dataset and dataloader...')
dataset = TextDataset(sequences, word2idx)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        logits = self.fc(lstm_out[:, -1, :])
        return logits

logging.info('Initializing the LSTM model...')
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
logging.info('Starting training...')
for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(dataloader):
        inputs, targets = batch
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            logging.info(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}')

# Save the model
logging.info('Saving the model...')
save_file(model.state_dict(), 'lstm_model.safetensors')
with open('word2idx.pkl', 'wb') as f:
    pickle.dump(word2idx, f)
with open('idx2word.pkl', 'wb') as f:
    pickle.dump(idx2word, f)

logging.info('Model and vocabulary saved successfully.')
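
A back-of-the-envelope sketch (not part of the uploaded files) of how the parameter count scales with the trainer.py settings. The vocabulary size of 10,000 in the example call is an assumption; the real value depends on the tokenized corpus.

# Illustrative sketch: approximate parameter count of the LSTMModel above.
def lstm_param_count(vocab_size, embedding_dim=256, hidden_dim=800, num_layers=4):
    embed = vocab_size * embedding_dim
    lstm = 0
    for layer in range(num_layers):
        input_size = embedding_dim if layer == 0 else hidden_dim
        # Each layer has 4 gates: input weights, recurrent weights, and two bias vectors.
        lstm += 4 * (input_size * hidden_dim + hidden_dim * hidden_dim + 2 * hidden_dim)
    fc = hidden_dim * vocab_size + vocab_size
    return embed + lstm + fc

print(lstm_param_count(vocab_size=10_000))  # roughly 29M parameters at this vocab size
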