In [1]:
!pip install transformers



In [16]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_seq_length):
        with open(file_path, 'r', encoding='utf-8') as f:
            self.text = f.read()
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.segments = []

        for line in self.text.split('\n'):
            if len(line) > 0:
                if len(line) > self.max_seq_length:
                    # Split long lines into shorter segments
                    segments = [line[i:i+self.max_seq_length] for i in range(0, len(line), self.max_seq_length)]
                    self.segments.extend(segments)
                else:
                    self.segments.append(line)

    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        self.tokenizer.pad_token_id = 0
        segment = self.segments[idx]
        input_ids = self.tokenizer.encode(segment, add_special_tokens=True)
        target_ids = input_ids[1:] + [self.tokenizer.pad_token_id]

        if not input_ids:
            return None
        

        # Pad the input sequence to the same length
        if len(input_ids) > self.max_seq_length:
            input_ids = input_ids[:self.max_seq_length]
            target_ids = target_ids[:self.max_seq_length]
        else:
            padding_length = self.max_seq_length - len(input_ids)
            input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
            target_ids = target_ids + [self.tokenizer.pad_token_id] * padding_length
        #print("segment:",segment,"\n input:", input_ids, "\n targets" ,target_ids)
        #print("\r segment:",segment, end="")
        
        return torch.tensor(input_ids), torch.tensor(target_ids)


# Initialize the tokenizer and the pre-trained model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)


# Define the path to the training text file
train_file = 'text.txt'

# Define the training and validation datasets
max_seq_length = 512

train_dataset = TextDataset(train_file, tokenizer, max_seq_length)
valid_dataset = TextDataset(train_file, tokenizer, max_seq_length)

# Define the training hyperparameters
batch_size = 8
num_epochs = 7
learning_rate = 5e-4

# Define the data loader for the training and validation datasets
train_loader = DataLoader(train_dataset,batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset,batch_size=batch_size, shuffle=True)

# Define the loss function and the optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

tokenizer.save_pretrained('fine_tuned_tokenizer')


('fine_tuned_tokenizer/tokenizer_config.json',
 'fine_tuned_tokenizer/special_tokens_map.json',
 'fine_tuned_tokenizer/vocab.json',
 'fine_tuned_tokenizer/merges.txt',
 'fine_tuned_tokenizer/added_tokens.json')

In [17]:
# Train the model for a fixed number of epochs
for epoch in range(num_epochs):
    # Training loop
    model.train()
    train_loss = 0
    i=0
    for batch in train_loader:
        i+=1
        input_ids, target_ids = batch
        input_ids, target_ids = input_ids.to(device), target_ids.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, labels=target_ids)
        loss = loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), target_ids.view(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        print(f'\rEpoch {epoch+1}/{num_epochs}, Train Batch {i}/{len(train_loader)}, Train Loss: {train_loss/len(train_loader)}',end="")


    # Validation loop
    model.eval()
    valid_loss = 0
    with torch.no_grad():
        i=0
        for batch in valid_loader:
            i+=1
            input_ids, target_ids = batch
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)
            outputs = model(input_ids, labels=target_ids)
            loss = loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), target_ids.view(-1))
            valid_loss += loss.item()
            print(f'\rEpoch {epoch+1}/{num_epochs}, Valid Batch {i}/{len(valid_loader)}, Valid Loss: {valid_loss/len(valid_loader)}',end="")

    print(f'\rEpoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader)}, Valid Loss: {valid_loss/len(valid_loader)}, lr: {learning_rate}',end="\n")
    learning_rate /= 5
model.save_pretrained('brunosan/GPT2-impactscience')
tokenizer.save_pretrained('brunosan/GPT2-impactscience')

Epoch 1/7, Train Loss: 0.5704409927129745, Valid Loss: 0.3897556330009205, lr: 0.0005
Epoch 2/7, Train Loss: 0.403159585849541, Valid Loss: 0.24347291268953464, lr: 0.0001
Epoch 3/7, Train Loss: 0.27275778398644634, Valid Loss: 0.13453135721203757, lr: 2e-05
Epoch 4/7, Train Loss: 0.1717792261482739, Valid Loss: 0.07003633981775038, lr: 4.000000000000001e-06
Epoch 5/7, Train Loss: 0.10565993448764813, Valid Loss: 0.03993069042065522, lr: 8.000000000000002e-07
Epoch 6/7, Train Loss: 0.0703794076675322, Valid Loss: 0.02531332855408148, lr: 1.6000000000000003e-07
Epoch 7/7, Train Batch 7/82, Train Loss: 0.0036747022645502556

In [None]:
model.save_pretrained('brunosan/GPT2-impactscience')
tokenizer.save_pretrained('brunosan/GPT2-impactscience')

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Define the device to run the model on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fine-tuned model
model_path = 'brunosan/GPT2-impactscience'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)

# Set the pad_token_id to the same value as the unk_token_id
#model.config.pad_token_id = tokenizer.unk_token_id

# Set the generation parameters
max_length = 100
num_beams = 5
no_repeat_ngram_size = 2
temperature = 1.0

# Generate text using beam search, n-grams, and other techniques
prompt = "The impact of climate change on "

def generate(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                          max_length=max_length, num_beams=num_beams,
                          no_repeat_ngram_size=no_repeat_ngram_size,
                          temperature=temperature, do_sample=True, top_p=0.95,
                          top_k=50)

    # Convert the generated output to string format
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

generate(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Antonio, who died in 2015, was a brilliant engineer who was able to master complex networks of increasingly complex projects. He was also very involved on airborne radars, ground planes, and hovercrafts. In the mid-90s, several European nations, some being primary producers of CFC, began considering regulations, or even creating public funds to assist their efforts. While extremely conscious of the environmental impact, at the same time, he also advocated for non-proliferation, peaceful'

In [None]:
model.push_to_hub("brunosan/GPT2-impactscience")
tokenizer.push_to_hub("brunosan/GPT2-impactscience")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/brunosan/GPT2-impactscience/commit/3316819787943d7a4693aee44c786ba4c2b4fc7f', commit_message='Upload tokenizer', commit_description='', oid='3316819787943d7a4693aee44c786ba4c2b4fc7f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-3.19.1-py3-none-any.whl (14.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.2/14.2 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting python-multipart
  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting altair>=4.2.0
  Downloading altair-4.2.2-py3-none-any.whl (813 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m813.6/813.6 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mdit-py-plugins<=0.3.3
  Downloading mdit_py_plugins-0.3.3-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-

In [None]:
import gradio as gr
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model_path = 'fine_tuned_model'
tokenizer_path = 'fine_tuned_tokenizer'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)

# Define the generation function
def generate_text(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                              max_length=100, num_beams=9,
                              no_repeat_ngram_size=2,
                              temperature=1.0, do_sample=True,
                              top_p=0.95, top_k=50)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Create a Gradio interface
input_text = gr.inputs.Textbox(lines=2, label="Enter the starting text")
output_text = gr.outputs.Textbox(label="Generated Text")

interface = gr.Interface(fn=generate_text, inputs=input_text, outputs=output_text,
             title="GPT-2 Impact Science Text Generator", description="Generate text using a fine-tuned GPT-2 model onthe Impact Science book.")

# Export the Gradio interface to the Hugging Face Model Hub
interface.launch(share=True)




Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://6989ff3075913a1f9b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
