I am trying to convert urls to from an image dataset to PIL type image, on huggingface platform I can see both image and url so when i only want image not url I tried to download it and use cast_column to convert it as follow:
from datasets import load_dataset
from datasets import Dataset
import datasets
dataset = load_dataset("mcemilg/laion2B-multi-turkish-subset", split = "train[:30000]")
dataset = dataset.cast_column("URL", datasets.Image())
dataset.push_to_hub("umarigan/clip_dataset")
Since some urls doesnt exist anymore it returns error, I couldn’t find to quickly eliminate that kind of url for bigger image dataset. The error I am facing is as follow:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-19-6c0f55a01058> in <cell line: 1>()
----> 1 dataset['train'][8]
11 frames
/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in __getitem__(self, key)
2798 def __getitem__(self, key): # noqa: F811
2799 """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 2800 return self._getitem(key)
2801
2802 def __getitems__(self, keys: List) -> List:
/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in _getitem(self, key, **kwargs)
2783 formatter = get_formatter(format_type, features=self._info.features, **format_kwargs)
2784 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
-> 2785 formatted_output = format_table(
2786 pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
2787 )
/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in format_table(table, key, formatter, format_columns, output_all_columns)
627 python_formatter = PythonFormatter(features=formatter.features)
628 if format_columns is None:
--> 629 return formatter(pa_table, query_type=query_type)
630 elif query_type == "column":
631 if key in format_columns:
/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in __call__(self, pa_table, query_type)
394 def __call__(self, pa_table: pa.Table, query_type: str) -> Union[RowFormat, ColumnFormat, BatchFormat]:
395 if query_type == "row":
--> 396 return self.format_row(pa_table)
397 elif query_type == "column":
398 return self.format_column(pa_table)
/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in format_row(self, pa_table)
435 return LazyRow(pa_table, self)
436 row = self.python_arrow_extractor().extract_row(pa_table)
--> 437 row = self.python_features_decoder.decode_row(row)
438 return row
439
/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in decode_row(self, row)
213
214 def decode_row(self, row: dict) -> dict:
--> 215 return self.features.decode_example(row) if self.features else row
216
217 def decode_column(self, column: list, column_name: str) -> list:
/usr/local/lib/python3.10/dist-packages/datasets/features/features.py in decode_example(self, example, token_per_repo_id)
1927 """
1928
-> 1929 return {
1930 column_name: decode_nested_example(feature, value, token_per_repo_id=token_per_repo_id)
1931 if self._column_requires_decoding[column_name]
/usr/local/lib/python3.10/dist-packages/datasets/features/features.py in <dictcomp>(.0)
1928
1929 return {
-> 1930 column_name: decode_nested_example(feature, value, token_per_repo_id=token_per_repo_id)
1931 if self._column_requires_decoding[column_name]
1932 else value
/usr/local/lib/python3.10/dist-packages/datasets/features/features.py in decode_nested_example(schema, obj, token_per_repo_id)
1337 # we pass the token to read and decode files from private repositories in streaming mode
1338 if obj is not None and schema.decode:
-> 1339 return schema.decode_example(obj, token_per_repo_id=token_per_repo_id)
1340 return obj
1341
/usr/local/lib/python3.10/dist-packages/datasets/features/image.py in decode_example(self, value, token_per_repo_id)
178 token = None
179 download_config = DownloadConfig(token=token)
--> 180 with xopen(path, "rb", download_config=download_config) as f:
181 bytes_ = BytesIO(f.read())
182 image = PIL.Image.open(bytes_)
/usr/local/lib/python3.10/dist-packages/datasets/download/streaming_download_manager.py in xopen(file, mode, download_config, *args, **kwargs)
504 kwargs = {**kwargs, **(storage_options or {})}
505 try:
--> 506 file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
507 except ValueError as e:
508 if str(e) == "Cannot seek streaming HTTP file":
/usr/local/lib/python3.10/dist-packages/fsspec/core.py in open(urlpath, mode, compression, encoding, errors, protocol, newline, **kwargs)
449 )
450 if not out:
--> 451 raise FileNotFoundError(urlpath)
452 return out[0]
453
FileNotFoundError: https://i0.wp.com/webadubradio.fr/wp-content/uploads/2018/12/webadubradio.fr-ogc-nice-antillais.png?resize=364%2C205&ssl=1```
I want to be able to eliminate urls that doesnt exist or cant be cast as Image type.
dataset version: 2.16.1
enviroment: Google Colab