stakelovelace committed on
Commit
2094fe7
·
1 Parent(s): 3b6b2b0
app.py CHANGED
@@ -1,42 +1,41 @@
1
- import pandas as pd
2
  import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
4
- import csv
5
- import yaml
6
  from datasets import Dataset
 
 
 
7
 
8
  import tensorflow as tf
9
  # Check TensorFlow GPU availability
10
  print("GPUs Available: ", tf.config.list_physical_devices('GPU'))
11
 
12
  import os
 
13
  os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
14
 
 
 
 
 
 
 
 
 
 
15
  def load_data_and_config(data_path):
16
  """Loads training data from CSV."""
17
  data = []
18
  with open(data_path, newline='', encoding='utf-8') as csvfile:
19
- reader = csv.DictReader(csvfile, delimiter=';') # Ensure delimiter matches your CSV file
20
  for row in reader:
21
- data.append({'text': row['description']}) # Changed from 'text' to 'description'
22
  return data
23
 
24
- def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
25
- """Generates an API query using a fine-tuned model."""
26
- input_ids = tokenizer.encode(prompt + f" Write an API query to {api_name} to get {desired_output}", return_tensors="pt")
27
- input_ids = input_ids.to(model.device) # Ensure input_ids are on the same device as the model
28
- output = model.generate(input_ids, max_length=256, temperature=0.7, do_sample=True) # Enable sampling with temperature control
29
- query = tokenizer.decode(output[0], skip_special_tokens=True)
30
- return f"{base_url}/{query}"
31
-
32
- from transformers import TrainingArguments, Trainer
33
-
34
- def train_model(model, tokenizer, data):
35
  """Trains the model using the Hugging Face Trainer API."""
36
- # Encode data and prepare labels
37
  inputs = [tokenizer(d['text'], max_length=512, truncation=True, padding='max_length', return_tensors="pt") for d in data]
38
  dataset = Dataset.from_dict({
39
- 'input_ids': [x['input_ids'].squeeze() for x in inputs], # remove extra dimensions
40
  'labels': [x['input_ids'].squeeze() for x in inputs]
41
  })
42
 
@@ -50,47 +49,53 @@ def train_model(model, tokenizer, data):
50
  logging_dir='./logs',
51
  logging_steps=10,
52
  )
53
-
54
  trainer = Trainer(
55
  model=model,
56
  args=training_args,
57
  train_dataset=dataset,
58
  tokenizer=tokenizer
59
  )
60
-
61
- # The Trainer handles the training loop internally
62
  trainer.train()
63
 
64
- # Optionally clear cache if using GPU or MPS
65
- if torch.cuda.is_available():
66
- torch.cuda.empty_cache()
67
- elif torch.backends.mps.is_built():
68
- torch.mps.empty_cache()
69
-
70
  # Perform any remaining steps such as logging, saving, etc.
71
  trainer.save_model()
72
 
73
  def main(api_name, base_url):
74
- # Load data
75
  data = load_data_and_config("train2.csv")
 
 
 
 
 
 
76
 
77
- # Load tokenizer and model
78
- tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
79
- model = AutoModelForCausalLM.from_pretrained("thenlper/gte-small")
80
-
81
- # Train the model on your dataset
82
- train_model(model, tokenizer, data)
83
 
84
- # Save the fine-tuned model
85
  model.save_pretrained("./fine_tuned_model")
86
  tokenizer.save_pretrained("./fine_tuned_model")
87
 
88
- # Example usage
89
  prompt = "I need to retrieve the latest block on chain using a python script"
90
  api_query = generate_api_query(model, tokenizer, prompt, "latest block on chain", api_name, base_url)
91
  print(f"Generated code: {api_query}")
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  if __name__ == "__main__":
94
  api_name = "Koios"
95
- base_url = "https://api.koios.rest"
96
  main(api_name, base_url)
 
 
1
  import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BertLMHeadModel, BertForSequenceClassification
 
 
3
  from datasets import Dataset
4
+ import pandas as pd
5
+ import csv
6
+ from transformers import TrainingArguments, Trainer
7
 
8
  import tensorflow as tf
9
  # Check TensorFlow GPU availability
10
  print("GPUs Available: ", tf.config.list_physical_devices('GPU'))
11
 
12
  import os
13
+ # Setting the environment variable for MPS
14
  os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
15
 
16
def get_device():
    """Return the best available torch device.

    Preference order is CUDA, then Apple MPS, then CPU.

    Returns:
        torch.device: A ``cuda``, ``mps`` or ``cpu`` device.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')

    # torch.backends.mps only exists on PyTorch >= 1.12; on older builds fall
    # through to CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')

    return torch.device('cpu')
24
+
25
  def load_data_and_config(data_path):
26
  """Loads training data from CSV."""
27
  data = []
28
  with open(data_path, newline='', encoding='utf-8') as csvfile:
29
+ reader = csv.DictReader(csvfile, delimiter=';')
30
  for row in reader:
31
+ data.append({'text': row['description']})
32
  return data
33
 
34
+ def train_model(model, tokenizer, data, device):
 
 
 
 
 
 
 
 
 
 
35
  """Trains the model using the Hugging Face Trainer API."""
 
36
  inputs = [tokenizer(d['text'], max_length=512, truncation=True, padding='max_length', return_tensors="pt") for d in data]
37
  dataset = Dataset.from_dict({
38
+ 'input_ids': [x['input_ids'].squeeze() for x in inputs],
39
  'labels': [x['input_ids'].squeeze() for x in inputs]
40
  })
41
 
 
49
  logging_dir='./logs',
50
  logging_steps=10,
51
  )
52
+
53
  trainer = Trainer(
54
  model=model,
55
  args=training_args,
56
  train_dataset=dataset,
57
  tokenizer=tokenizer
58
  )
59
+
 
60
  trainer.train()
61
 
 
 
 
 
 
 
62
  # Perform any remaining steps such as logging, saving, etc.
63
  trainer.save_model()
64
 
65
  def main(api_name, base_url):
66
+ device = get_device() # Get the appropriate device
67
  data = load_data_and_config("train2.csv")
68
+ tokenizer = AutoTokenizer.from_pretrained("google/codegemma-2b")
69
+ model = AutoModelForCausalLM.from_pretrained('google/codegemma-2b', is_decoder=True)
70
+ #model = BertLMHeadModel.from_pretrained('google/codegemma-2b', is_decoder=True)
71
+ # Example assuming you have a prepared dataset for classification
72
+ #model = BertForSequenceClassification.from_pretrained('thenlper/gte-small', num_labels=2, is_decoder=True) # binary classification
73
+ model.to(device) # Move model to the appropriate device
74
 
75
+ train_model(model, tokenizer, data, device)
 
 
 
 
 
76
 
 
77
  model.save_pretrained("./fine_tuned_model")
78
  tokenizer.save_pretrained("./fine_tuned_model")
79
 
 
80
  prompt = "I need to retrieve the latest block on chain using a python script"
81
  api_query = generate_api_query(model, tokenizer, prompt, "latest block on chain", api_name, base_url)
82
  print(f"Generated code: {api_query}")
83
 
84
def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
    """Generate an API query with the model and append it to ``base_url``.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        prompt: Free-text description of what the caller wants to do.
        desired_output: Short description of the data the query should return.
        api_name: Name of the target API, interpolated into the instruction.
        base_url: Base URL the generated query is appended to.

    Returns:
        str: ``base_url`` and the generated text joined by a single ``/``.
    """
    input_ids = tokenizer.encode(
        f"{prompt} Write an API query to {api_name} to get {desired_output}",
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model weights.
    input_ids = input_ids.to(model.device)

    # max_new_tokens bounds only the continuation; max_length would also count
    # the prompt tokens and can leave no room to generate for long prompts.
    output = model.generate(input_ids, max_new_tokens=256, temperature=0.1, do_sample=True)

    generated = output[0]
    # Decoder-only models return prompt + continuation; drop the echoed prompt
    # so it does not end up inside the URL. Encoder-decoder models return only
    # the continuation, so their output is left untouched.
    model_config = getattr(model, "config", None)
    if not getattr(model_config, "is_encoder_decoder", False):
        generated = generated[input_ids.shape[-1]:]

    query = tokenizer.decode(generated, skip_special_tokens=True).strip()
    # Avoid a double slash when base_url already ends with '/'.
    return f"{base_url.rstrip('/')}/{query}"
97
+
98
  if __name__ == "__main__":
99
  api_name = "Koios"
100
+ base_url = "https://api.koios.rest/v1"
101
  main(api_name, base_url)
logs/events.out.tfevents.1714322367.172-3-0-7.lightspeed.irvnca.sbcglobal.net.39122.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83827f2cf7d20a317b97a09a293ebac35eb1e809d395d2ec317c06950d3f40c6
3
+ size 6596
results/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21b4ed4bb45f70522e361ac23b7d2e031a99706cbde4e236374a52b3d6b0b7a2
3
+ size 133588624