import sagemaker
from sagemaker.huggingface import HuggingFace
# gets role for executing training job
role = sagemaker.get_execution_role()
hyperparameters = {
'epochs': 1,
'train_batch_size': 128,
'model_name_or_path':'EleutherAI/gpt-j-6B',
'output_dir':'/opt/ml/model'
# add your remaining hyperparameters
# more info here https://github.com/huggingface/transformers/tree/v4.6.1/examples/pytorch/language-modeling
}
# git configuration to download our fine-tuning script
git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.6.1'}
# creates Hugging Face estimator
huggingface_estimator = HuggingFace(
entry_point='run_clm.py',
source_dir='./examples/pytorch/language-modeling',
instance_type='ml.p3.2xlarge',
instance_count=1,
role=role,
git_config=git_config,
transformers_version='4.6.1',
pytorch_version='1.7.1',
py_version='py36',
hyperparameters = hyperparameters
)
# starting the train job
huggingface_estimator.fit({'training': 's3://domain-gen-data/domain-gen-training.jsonl'})
above is the code and below is the error
2021-08-31 07:31:41 Starting - Starting the training job...
2021-08-31 07:32:07 Starting - Launching requested ML instancesProfilerReport-1630395096: InProgress
......
2021-08-31 07:33:08 Starting - Preparing the instances for training......
2021-08-31 07:34:08 Downloading - Downloading input data...
2021-08-31 07:34:28 Training - Downloading the training image..................
2021-08-31 07:37:33 Training - Training image download completed. Training in progress.bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2021-08-31 07:37:34,584 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training
2021-08-31 07:37:34,615 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.
2021-08-31 07:37:36,036 sagemaker_pytorch_container.training INFO Invoking user training script.
2021-08-31 07:37:36,481 sagemaker-training-toolkit INFO Installing dependencies from requirements.txt:
/opt/conda/bin/python3.6 -m pip install -r requirements.txt
Requirement already satisfied: datasets>=1.1.3 in /opt/conda/lib/python3.6/site-packages (from -r requirements.txt (line 1)) (1.6.2)
Requirement already satisfied: sentencepiece!=0.1.92 in /opt/conda/lib/python3.6/site-packages (from -r requirements.txt (line 2)) (0.1.91)
Requirement already satisfied: protobuf in /opt/conda/lib/python3.6/site-packages (from -r requirements.txt (line 3)) (3.17.1)
Requirement already satisfied: multiprocess in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (0.70.11.1)
Requirement already satisfied: pyarrow>=1.0.0<4.0.0 in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (4.0.0)
Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (1.19.1)
Requirement already satisfied: xxhash in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (2.0.2)
Requirement already satisfied: tqdm<4.50.0,>=4.27 in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (4.49.0)
Requirement already satisfied: huggingface-hub<0.1.0 in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (0.0.8)
Requirement already satisfied: packaging in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (20.9)
Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (4.0.1)
Requirement already satisfied: fsspec in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (2021.5.0)
Requirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (2.25.1)
Requirement already satisfied: dataclasses in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (0.8)
Requirement already satisfied: pandas in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (1.1.5)
Requirement already satisfied: dill in /opt/conda/lib/python3.6/site-packages (from datasets>=1.1.3->-r requirements.txt (line 1)) (0.3.3)
Requirement already satisfied: filelock in /opt/conda/lib/python3.6/site-packages (from huggingface-hub<0.1.0->datasets>=1.1.3->-r requirements.txt (line 1)) (3.0.12)
Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests>=2.19.0->datasets>=1.1.3->-r requirements.txt (line 1)) (2020.12.5)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests>=2.19.0->datasets>=1.1.3->-r requirements.txt (line 1)) (1.25.11)
Requirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests>=2.19.0->datasets>=1.1.3->-r requirements.txt (line 1)) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests>=2.19.0->datasets>=1.1.3->-r requirements.txt (line 1)) (2.10)
Requirement already satisfied: six>=1.9 in /opt/conda/lib/python3.6/site-packages (from protobuf->-r requirements.txt (line 3)) (1.16.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /opt/conda/lib/python3.6/site-packages (from importlib-metadata->datasets>=1.1.3->-r requirements.txt (line 1)) (3.10.0.0)
Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.6/site-packages (from importlib-metadata->datasets>=1.1.3->-r requirements.txt (line 1)) (3.4.1)
Requirement already satisfied: pyparsing>=2.0.2 in /opt/conda/lib/python3.6/site-packages (from packaging->datasets>=1.1.3->-r requirements.txt (line 1)) (2.4.7)
Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.6/site-packages (from pandas->datasets>=1.1.3->-r requirements.txt (line 1)) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.6/site-packages (from pandas->datasets>=1.1.3->-r requirements.txt (line 1)) (2021.1)
WARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv
2021-08-31 07:37:39,041 sagemaker-training-toolkit INFO Invoking user script
Training Env:
{
"additional_framework_parameters": {},
"channel_input_dirs": {
"training": "/opt/ml/input/data/training"
},
"current_host": "algo-1",
"framework_module": "sagemaker_pytorch_container.training:main",
"hosts": [
"algo-1"
],
"hyperparameters": {
"train_batch_size": 128,
"output_dir": "/opt/ml/model",
"epochs": 1,
"model_name_or_path": "EleutherAI/gpt-j-6B"
},
"input_config_dir": "/opt/ml/input/config",
"input_data_config": {
"training": {
"TrainingInputMode": "File",
"S3DistributionType": "FullyReplicated",
"RecordWrapperType": "None"
}
},
"input_dir": "/opt/ml/input",
"is_master": true,
"job_name": "huggingface-pytorch-training-2021-08-31-07-31-36-059",
"log_level": 20,
"master_hostname": "algo-1",
"model_dir": "/opt/ml/model",
"module_dir": "s3://sagemaker-us-east-1-765248384165/huggingface-pytorch-training-2021-08-31-07-31-36-059/source/sourcedir.tar.gz",
"module_name": "run_clm",
"network_interface_name": "eth0",
"num_cpus": 8,
"num_gpus": 1,
"output_data_dir": "/opt/ml/output/data",
"output_dir": "/opt/ml/output",
"output_intermediate_dir": "/opt/ml/output/intermediate",
"resource_config": {
"current_host": "algo-1",
"hosts": [
"algo-1"
],
"network_interface_name": "eth0"
},
"user_entry_point": "run_clm.py"
}
Environment variables:
SM_HOSTS=["algo-1"]
SM_NETWORK_INTERFACE_NAME=eth0
SM_HPS={"epochs":1,"model_name_or_path":"EleutherAI/gpt-j-6B","output_dir":"/opt/ml/model","train_batch_size":128}
SM_USER_ENTRY_POINT=run_clm.py
SM_FRAMEWORK_PARAMS={}
SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"}
SM_INPUT_DATA_CONFIG={"training":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}
SM_OUTPUT_DATA_DIR=/opt/ml/output/data
SM_CHANNELS=["training"]
SM_CURRENT_HOST=algo-1
SM_MODULE_NAME=run_clm
SM_LOG_LEVEL=20
SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main
SM_INPUT_DIR=/opt/ml/input
SM_INPUT_CONFIG_DIR=/opt/ml/input/config
SM_OUTPUT_DIR=/opt/ml/output
SM_NUM_CPUS=8
SM_NUM_GPUS=1
SM_MODEL_DIR=/opt/ml/model
SM_MODULE_DIR=s3://sagemaker-us-east-1-765248384165/huggingface-pytorch-training-2021-08-31-07-31-36-059/source/sourcedir.tar.gz
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"training":"/opt/ml/input/data/training"},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{"epochs":1,"model_name_or_path":"EleutherAI/gpt-j-6B","output_dir":"/opt/ml/model","train_batch_size":128},"input_config_dir":"/opt/ml/input/config","input_data_config":{"training":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"huggingface-pytorch-training-2021-08-31-07-31-36-059","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-765248384165/huggingface-pytorch-training-2021-08-31-07-31-36-059/source/sourcedir.tar.gz","module_name":"run_clm","network_interface_name":"eth0","num_cpus":8,"num_gpus":1,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"run_clm.py"}
SM_USER_ARGS=["--epochs","1","--model_name_or_path","EleutherAI/gpt-j-6B","--output_dir","/opt/ml/model","--train_batch_size","128"]
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
SM_CHANNEL_TRAINING=/opt/ml/input/data/training
SM_HP_TRAIN_BATCH_SIZE=128
SM_HP_OUTPUT_DIR=/opt/ml/model
SM_HP_EPOCHS=1
SM_HP_MODEL_NAME_OR_PATH=EleutherAI/gpt-j-6B
PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages
Invoking script with the following command:
/opt/conda/bin/python3.6 run_clm.py --epochs 1 --model_name_or_path EleutherAI/gpt-j-6B --output_dir /opt/ml/model --train_batch_size 128
Traceback (most recent call last):
File "run_clm.py", line 468, in <module>
main()
File "run_clm.py", line 182, in main
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/opt/conda/lib/python3.6/site-packages/transformers/hf_argparser.py", line 187, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 12, in __init__
File "run_clm.py", line 161, in __post_init__
raise ValueError("Need either a dataset name or a training/validation file.")
ValueError: Need either a dataset name or a training/validation file.
2021-08-31 07:37:44,472 sagemaker-training-toolkit ERROR ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 run_clm.py --epochs 1 --model_name_or_path EleutherAI/gpt-j-6B --output_dir /opt/ml/model --train_batch_size 128"
Traceback (most recent call last):
File "run_clm.py", line 468, in <module>
main()
File "run_clm.py", line 182, in main
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/opt/conda/lib/python3.6/site-packages/transformers/hf_argparser.py", line 187, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 12, in __init__
File "run_clm.py", line 161, in __post_init__
raise ValueError("Need either a dataset name or a training/validation file.")
ValueError: Need either a dataset name or a training/validation file.
2021-08-31 07:37:49 Uploading - Uploading generated training model
2021-08-31 07:37:49 Failed - Training job failed
---------------------------------------------------------------------------
UnexpectedStatusException Traceback (most recent call last)
<ipython-input-5-33c7f1decb60> in <module>
31
32 # starting the train job
---> 33 huggingface_estimator.fit({'training': 's3://domain-gen-data/domain-gen-training.jsonl'})
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
680 self.jobs.append(self.latest_training_job)
681 if wait:
--> 682 self.latest_training_job.wait(logs=logs)
683
684 def _compilation_job_name(self):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/estimator.py in wait(self, logs)
1625 # If logs are requested, call logs_for_jobs.
1626 if logs != "None":
-> 1627 self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs)
1628 else:
1629 self.sagemaker_session.wait_for_job(self.job_name)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll, log_type)
3731
3732 if wait:
-> 3733 self._check_job_status(job_name, description, "TrainingJobStatus")
3734 if dot:
3735 print()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name)
3291 ),
3292 allowed_statuses=["Completed", "Stopped"],
-> 3293 actual_status=status,
3294 )
3295
UnexpectedStatusException: Error for Training job huggingface-pytorch-training-2021-08-31-07-31-36-059: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 run_clm.py --epochs 1 --model_name_or_path EleutherAI/gpt-j-6B --output_dir /opt/ml/model --train_batch_size 128"
Traceback (most recent call last):
File "run_clm.py", line 468, in <module>
main()
File "run_clm.py", line 182, in main
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/opt/conda/lib/python3.6/site-packages/transformers/hf_argparser.py", line 187, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 12, in __init__
File "run_clm.py", line 161, in __post_init__
raise ValueError("Need either a dataset name or a training/validation file.")
ValueError: Need either a dataset name or a training/validation file.