Dear community members and @John6666
Continuing from this problem, I have tried to update my id2label.json using the code at the bottom of this post, which collects the predicted labels of every image and writes them into a JSON file in my dataset directory.
But the problem still persists, and I hope some of the capable community members can help me track it down, because I don't know what to do anymore. I will keep looking for a solution myself, but I guess more hands are faster than mine alone…
The error traceback
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[17], line 1
----> 1 trainer.train()
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\transformers\trainer.py:2155, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2152 try:
2153 # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
2154 hf_hub_utils.disable_progress_bars()
-> 2155 return inner_training_loop(
2156 args=args,
2157 resume_from_checkpoint=resume_from_checkpoint,
2158 trial=trial,
2159 ignore_keys_for_eval=ignore_keys_for_eval,
2160 )
2161 finally:
2162 hf_hub_utils.enable_progress_bars()
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\transformers\trainer.py:2522, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2516 context = (
2517 functools.partial(self.accelerator.no_sync, model=model)
2518 if i != len(batch_samples) - 1
2519 else contextlib.nullcontext
2520 )
2521 with context():
-> 2522 tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
2524 if (
2525 args.logging_nan_inf_filter
2526 and not is_torch_xla_available()
2527 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2528 ):
2529 # if loss is nan or inf simply add the average of previous logged losses
2530 tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\transformers\trainer.py:3655, in Trainer.training_step(self, model, inputs, num_items_in_batch)
3653 loss = self.compute_loss(model, inputs)
3654 else:
-> 3655 loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
3657 del inputs
3658 if (
3659 self.args.torch_empty_cache_steps is not None
3660 and self.state.global_step % self.args.torch_empty_cache_steps == 0
3661 ):
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\transformers\trainer.py:3709, in Trainer.compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
3707 loss_kwargs["num_items_in_batch"] = num_items_in_batch
3708 inputs = {**inputs, **loss_kwargs}
-> 3709 outputs = model(**inputs)
3710 # Save past state if it exists
3711 # TODO: this needs to be fixed and made cleaner later.
3712 if self.args.past_index >= 0:
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\transformers\models\segformer\modeling_segformer.py:809, in SegformerForSemanticSegmentation.forward(self, pixel_values, labels, output_attentions, output_hidden_states, return_dict)
807 if self.config.num_labels > 1:
808 loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
--> 809 loss = loss_fct(upsampled_logits, labels)
810 elif self.config.num_labels == 1:
811 valid_mask = ((labels >= 0) & (labels != self.config.semantic_loss_ignore_index)).float()
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\torch\nn\modules\loss.py:1293, in CrossEntropyLoss.forward(self, input, target)
1292 def forward(self, input: Tensor, target: Tensor) -> Tensor:
-> 1293 return F.cross_entropy(
1294 input,
1295 target,
1296 weight=self.weight,
1297 ignore_index=self.ignore_index,
1298 reduction=self.reduction,
1299 label_smoothing=self.label_smoothing,
1300 )
File c:\Users\Lenovo\miniconda3\envs\pretrain-huggingface\Lib\site-packages\torch\nn\functional.py:3479, in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
3477 if size_average is not None or reduce is not None:
3478 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3479 return torch._C._nn.cross_entropy_loss(
3480 input,
3481 target,
3482 weight,
3483 _Reduction.get_enum(reduction),
3484 ignore_index,
3485 label_smoothing,
3486 )
IndexError: Target 225 is out of bounds.
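For context on what the traceback is saying: the cross-entropy loss treats every pixel value in the label mask as a class index, so "Target 225 is out of bounds" means at least one mask pixel holds the value 225 while the model only has num_labels classes. A minimal check along these lines (just a sketch, reusing the dataset identifier from the code below) shows the raw value range of the masks:

import numpy as np
from datasets import load_dataset

ds_check = load_dataset("seand0101/segformer-b0-finetuned-ade-512-512-manggarai-watergate")
mask = np.array(ds_check["train"][0]["label"].convert("L"))
print(mask.min(), mask.max(), np.unique(mask))  # any value >= num_labels (other than an ignore index such as 255) will break the loss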
The code
model_checkpoint = "nvidia/segformer-b0-finetuned-ade-512-512"
from datasets import load_dataset
hf_dataset_identifier = "seand0101/segformer-b0-finetuned-ade-512-512-manggarai-watergate"
ds = load_dataset(hf_dataset_identifier)
ds
im_d = ds["train"]["label"][0].convert("L")
im_d
from huggingface_hub import hf_hub_download
import json
import numpy as np

filename = "id2label.json"
id2label = json.load(
    open(hf_hub_download(hf_dataset_identifier, filename, repo_type="dataset"), "r")
)
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)
unique_labels = np.unique(id2label)
unique_labels
ds = ds.shuffle(seed=1)
ds = ds["train"].train_test_split(test_size=0.2)
train_ds = ds["train"]
test_ds = ds["test"]
from transformers import AutoFeatureExtractor
feature_extractor = AutoFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
from torchvision.transforms import ColorJitter
from transformers import SegformerFeatureExtractor
feature_extractor = SegformerFeatureExtractor()
jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
def train_transforms(example_batch):
    images = [jitter(x) for x in example_batch['pixel_values']]
    labels = [x.convert("L") for x in example_batch['label']]
    inputs = feature_extractor(images, labels)
    return inputs

def val_transforms(example_batch):
    images = [x for x in example_batch['pixel_values']]
    labels = [x.convert("L") for x in example_batch['label']]
    inputs = feature_extractor(images, labels)
    return inputs
# Set transforms
train_ds.set_transform(train_transforms)
test_ds.set_transform(val_transforms)
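In case it helps with diagnosing this, here is a quick sanity check that I believe should work with the objects defined above (a sketch, not a cell from my notebook): encode one raw image/mask pair with the feature extractor and look at the label ids that actually reach the loss.

raw = load_dataset(hf_dataset_identifier, split="train")  # untouched copy, without the transforms
enc = feature_extractor([raw[0]["pixel_values"]], [raw[0]["label"].convert("L")], return_tensors="pt")
print(enc["labels"].unique(), "vs num_labels =", num_labels)  # every id (apart from the 255 ignore value) should be < num_labels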
from transformers import SegformerForSemanticSegmentation
model = SegformerForSemanticSegmentation.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # Will ensure the segmentation specific components are reinitialized.
)
from transformers import TrainingArguments
epochs = 50
lr = 0.00006
batch_size = 2
hub_model_id = "nvidia/segformer-b0-finetuned-ade-512-512"
training_args = TrainingArguments(
    "segformer-b0-finetuned-ade20k-manggarai_rivergate",
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=20,
    eval_steps=20,
    logging_steps=1,
    eval_accumulation_steps=5,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
)
import torch
from torch import nn
import evaluate
metric = evaluate.load("mean_iou")
def compute_metrics(eval_pred):
    with torch.no_grad():
        logits, labels = eval_pred
        logits_tensor = torch.from_numpy(logits)
        # scale the logits to the size of the label
        logits_tensor = nn.functional.interpolate(
            logits_tensor,
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        ).argmax(dim=1)

        pred_labels = logits_tensor.detach().cpu().numpy()
        # currently using _compute instead of compute
        # see this issue for more info: https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576
        metrics = metric._compute(
            predictions=pred_labels,
            references=labels,
            num_labels=len(id2label),
            ignore_index=0,
            reduce_labels=feature_extractor.reduce_labels,
        )

        # add per category metrics as individual key-value pairs
        per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
        per_category_iou = metrics.pop("per_category_iou").tolist()
        metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
        metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})
        return metrics
print(len(id2label))
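What I am trying to confirm with that print is that the label count matches what the model ended up configured with; a comparison like this should work with the model object created above (just a sketch):

print(len(id2label), model.config.num_labels)  # these two should agree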
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=feature_extractor,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)
trainer.train()
I did this in a Jupyter notebook, so sorry if the layout is a bit weird. I copied everything in, in case something I assumed was correct is actually the center of the problem.
Thank you in advance.
Oh, and here is the code I use to extract the labels into id2label.json:
from transformers import SegformerForSemanticSegmentation, SegformerFeatureExtractor
import torch
import numpy as np
from torchvision import transforms
from PIL import Image
# Load the pre-trained model and feature extractor
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
# example for testing a new image
image_path = large_image_stack_512[0]
image = Image.open(image_path).convert("RGB")
image
# Prepare the image by applying transformations
inputs = feature_extractor(images=image, return_tensors="pt")
# Perform inference on the image
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits  # Predicted logits for each pixel
# Manually define a mapping (example, you can get this from a dataset like ADE20k)
id2label = {
"0": "wall",
"1": "building",
"2": "sky",
"3": "floor",
"4": "tree",
"5": "ceiling",
"6": "road",
"7": "bed ",
"8": "windowpane",
"9": "grass",
"10": "cabinet",
"11": "sidewalk",
"12": "person",
"13": "earth",
"14": "door",
"15": "table",
"16": "mountain",
"17": "plant",
"18": "curtain",
"19": "chair",
"20": "car",
"21": "water",
"22": "painting",
"23": "sofa",
"24": "shelf",
"25": "house",
"26": "sea",
"27": "mirror",
"28": "rug",
"29": "field",
"30": "armchair",
"31": "seat",
"32": "fence",
"33": "desk",
"34": "rock",
"35": "wardrobe",
"36": "lamp",
"37": "bathtub",
"38": "railing",
"39": "cushion",
"40": "base",
"41": "box",
"42": "column",
"43": "signboard",
"44": "chest of drawers",
"45": "counter",
"46": "sand",
"47": "sink",
"48": "skyscraper",
"49": "fireplace",
"50": "refrigerator",
"51": "grandstand",
"52": "path",
"53": "stairs",
"54": "runway",
"55": "case",
"56": "pool table",
"57": "pillow",
"58": "screen door",
"59": "stairway",
"60": "river",
"61": "bridge",
"62": "bookcase",
"63": "blind",
"64": "coffee table",
"65": "toilet",
"66": "flower",
"67": "book",
"68": "hill",
"69": "bench",
"70": "countertop",
"71": "stove",
"72": "palm",
"73": "kitchen island",
"74": "computer",
"75": "swivel chair",
"76": "boat",
"77": "bar",
"78": "arcade machine",
"79": "hovel",
"80": "bus",
"81": "towel",
"82": "light",
"83": "truck",
"84": "tower",
"85": "chandelier",
"86": "awning",
"87": "streetlight",
"88": "booth",
"89": "television receiver",
"90": "airplane",
"91": "dirt track",
"92": "apparel",
"93": "pole",
"94": "land",
"95": "bannister",
"96": "escalator",
"97": "ottoman",
"98": "bottle",
"99": "buffet",
"100": "poster",
"101": "stage",
"102": "van",
"103": "ship",
"104": "fountain",
"105": "conveyer belt",
"106": "canopy",
"107": "washer",
"108": "plaything",
"109": "swimming pool",
"110": "stool",
"111": "barrel",
"112": "basket",
"113": "waterfall",
"114": "tent",
"115": "bag",
"116": "minibike",
"117": "cradle",
"118": "oven",
"119": "ball",
"120": "food",
"121": "step",
"122": "tank",
"123": "trade name",
"124": "microwave",
"125": "pot",
"126": "animal",
"127": "bicycle",
"128": "lake",
"129": "dishwasher",
"130": "screen",
"131": "blanket",
"132": "sculpture",
"133": "hood",
"134": "sconce",
"135": "vase",
"136": "traffic light",
"137": "tray",
"138": "ashcan",
"139": "fan",
"140": "pier",
"141": "crt screen",
"142": "plate",
"143": "monitor",
"144": "bulletin board",
"145": "shower",
"146": "radiator",
"147": "glass",
"148": "clock",
"149": "flag"
}
import os

# get image masks
# Importing dataset pictures
ROOT_DIR = os.path.abspath("D:\\512\\512_adjust")
i = 1
large_image_stack_512 = []
#print(i)
#directory = os.fsencode(ROOT_DIR)
print("Uploading images from:", ROOT_DIR)
for file in os.listdir(ROOT_DIR):
    filename = os.fsdecode(file)
    if filename.endswith("jpg") or filename.endswith(".jpeg"):
        file = os.path.join(ROOT_DIR, filename)
        #print(filename)
        large_image_stack_512.append(file)
        #large_image_stack = np.array(filename)
        i += 1
        #print(i)
        continue
    else:
        continue
print("Image addresses retrieved:", len(large_image_stack_512))
for_json = []
json_fill_flag = 0
content_i = 0
print(len(for_json))

f = open("forjson.txt", "w")
for im in large_image_stack_512:
    inputs = feature_extractor(images=Image.open(im).convert("RGB"), return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # Predicted logits for each pixel
    predicted_mask = torch.argmax(logits, dim=1).squeeze().cpu().numpy()
    unique_class_indices = np.unique(predicted_mask)
    print("image_path", im)
    print(f"unique_class_indices: {unique_class_indices}")
    for class_idx in unique_class_indices:
        class_id = class_idx
        image_label = id2label.get(str(class_idx))
        #print(f"{class_idx}: {id2label.get(str(class_idx))}")
        new_f = tuple((class_id, image_label))
        if new_f not in for_json:
            for_json.append(new_f)

print(for_json)
The output of the list:
[(0, 'wall'),
(1, 'building'),
(2, 'sky'),
(4, 'tree'),
(13, 'earth'),
(21, 'water'),
(32, 'fence'),
(38, 'railing'),
(61, 'bridge'),
(103, 'ship'),
(140, 'pier'),
(16, 'mountain'),
(122, 'tank'),
(46, 'sand'),
(26, 'sea'),
(12, 'person'),
(94, 'land'),
(3, 'floor'),
(53, 'stairs'),
(109, 'swimming pool'),
(60, 'river'),
(34, 'rock'),
(76, 'boat'),
(6, 'road'),
(20, 'car'),
(83, 'truck')]
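And the last step, actually writing that list out to id2label.json, is roughly this (sketched here, using the same string-keyed layout as the original id2label.json):

import json

new_id2label = {str(class_id): label for class_id, label in sorted(for_json)}
with open("id2label.json", "w") as f:
    json.dump(new_id2label, f, indent=2)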