# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""CLRS dataset.""" | |
import dataclasses | |
import functools | |
from typing import Iterator | |
from clrs._src import probing | |
from clrs._src import samplers | |
from clrs._src import specs | |
import jax | |
import numpy as np | |
import tensorflow as tf | |
import tensorflow_datasets as tfds | |

def _correct_axis_filtering(tensor, index, name):
  if 'hint_' in name:
    return tensor[:, index]
  else:
    return tensor[index]

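# In the sampled data, hint tensors are time-major ([time, num_samples, ...]),
# while inputs, outputs and `lengths` are sample-major ([num_samples, ...]);
# `_correct_axis_filtering` above therefore selects a single sample along
# axis 1 for hints and along axis 0 for everything else.
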

@dataclasses.dataclass
class CLRSConfig(tfds.core.BuilderConfig):
  """Specify the split in the variant because they have different shapes."""
  split: str = ''


DEFAULT_BUILDER_CONFIGS = []


def _build_default_builder_configs():
  for split in ['train', 'val', 'test']:
    for alg in specs.CLRS_30_ALGS:
      DEFAULT_BUILDER_CONFIGS.append(
          CLRSConfig(name=f'{alg}_{split}', split=split))


_build_default_builder_configs()


class CLRSDataset(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the CLRS dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }
  BUILDER_CONFIGS = DEFAULT_BUILDER_CONFIGS

  _instantiated_dataset = None
  _instantiated_dataset_name = ''
  _instantiated_dataset_split = ''

  def _num_samples(self, algorithm_name):
    num_samples = samplers.CLRS30[self._builder_config.split]['num_samples']  # pytype: disable=attribute-error  # always-use-return-annotations
    if self._builder_config.split != 'train':  # pytype: disable=attribute-error  # always-use-return-annotations
      # Generate more samples for those algorithms in which the number of
      # signals is small.
      num_samples *= specs.CLRS_30_ALGS_SETTINGS[algorithm_name][
          'num_samples_multiplier']
    return num_samples

  def _create_data(self, single_sample):
    algorithm_name = '_'.join(self._builder_config.name.split('_')[:-1])
    num_samples = self._num_samples(algorithm_name)
    sampler, _ = samplers.build_sampler(
        algorithm_name,
        seed=samplers.CLRS30[self._builder_config.split]['seed'],  # pytype: disable=attribute-error  # always-use-return-annotations
        num_samples=num_samples,
        length=samplers.CLRS30[self._builder_config.split]['length'],  # pytype: disable=attribute-error  # always-use-return-annotations
    )
    sampled_dataset = sampler.next(batch_size=1 if single_sample else None)
    data = {'input_' + t.name: t.data for t in sampled_dataset.features.inputs}
    # All other data points have input_, hint_, and output_ prefixes, so we
    # guarantee that this key is unused.
    data['lengths'] = sampled_dataset.features.lengths
    data.update({'output_' + t.name: t.data for t in sampled_dataset.outputs})
    data.update({
        'hint_' + t.name: t.data for t in sampled_dataset.features.hints})
    self._instantiated_dataset = data

  def _info(self) -> tfds.core.DatasetInfo:
    if tf.io.gfile.exists(self.data_dir):
      info = tfds.core.DatasetInfo(builder=self)
      info.read_from_directory(self.data_dir)
      return info
    if (self._instantiated_dataset_name != self._builder_config.name
        or self._instantiated_dataset_split != self._builder_config.split):  # pytype: disable=attribute-error  # always-use-return-annotations
      self._create_data(single_sample=True)
    data = {k: _correct_axis_filtering(v, 0, k)
            for k, v in self._instantiated_dataset.items()}
    data_info = {
        k: tfds.features.Tensor(shape=v.shape, dtype=tf.dtypes.as_dtype(
            v.dtype)) for k, v in data.items()}
    return tfds.core.DatasetInfo(
        builder=self,
        features=tfds.features.FeaturesDict(data_info),
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Download the data and define splits."""
    if (self._instantiated_dataset_name != self._builder_config.name
        or self._instantiated_dataset_split != self._builder_config.split):  # pytype: disable=attribute-error  # always-use-return-annotations
      self._create_data(single_sample=False)
      self._instantiated_dataset_name = self._builder_config.name
      self._instantiated_dataset_split = self._builder_config.split  # pytype: disable=attribute-error  # always-use-return-annotations
    return {self._builder_config.split: self._generate_examples()}  # pytype: disable=attribute-error  # always-use-return-annotations

  def _generate_examples(self):
    """Generator of examples for each split."""
    algorithm_name = '_'.join(self._builder_config.name.split('_')[:-1])
    for i in range(self._num_samples(algorithm_name)):
      data = {k: _correct_axis_filtering(v, i, k)
              for k, v in self._instantiated_dataset.items()}
      yield str(i), data


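# Example (illustrative sketch; 'bfs' and '/tmp/CLRS30' are placeholder
# choices): importing this module registers the builder with TFDS, after
# which a single algorithm/split variant can be generated locally:
#
#   builder = tfds.builder('clrs_dataset/bfs_train', data_dir='/tmp/CLRS30')
#   builder.download_and_prepare()

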
def _get_clrs_file_name():
  return f'CLRS30_v{CLRSDataset.VERSION}.tar.gz'


def get_dataset_gcp_url():
  return f'https://storage.googleapis.com/dm-clrs/{_get_clrs_file_name()}'


def get_clrs_folder():
  return f'CLRS30_v{CLRSDataset.VERSION}'


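# Example (illustrative sketch; the destination directory is a placeholder):
# the URL and folder helpers above can be combined with standard-library tools
# to fetch and unpack a pre-built copy of the dataset:
#
#   import os
#   import tarfile
#   import urllib.request
#
#   out_dir = '/tmp'
#   archive = os.path.join(out_dir, _get_clrs_file_name())
#   urllib.request.urlretrieve(get_dataset_gcp_url(), archive)
#   with tarfile.open(archive) as tar:
#     tar.extractall(out_dir)  # Unpacks into a get_clrs_folder() subdirectory.

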
def _preprocess(data_point, algorithm=None):
  """Convert sampled inputs into DataPoints."""
  inputs = []
  outputs = []
  hints = []
  lengths = None

  for name, data in data_point.items():
    if name == 'lengths':
      lengths = data
      continue
    data_point_name = name.split('_')
    name = '_'.join(data_point_name[1:])
    (stage, location, dp_type) = specs.SPECS[algorithm][name]
    assert stage == data_point_name[0]
    if stage == specs.Stage.HINT:
      data = tf.experimental.numpy.swapaxes(data, 0, 1)
    dp = probing.DataPoint(name, location, dp_type, data)
    if stage == specs.Stage.INPUT:
      inputs.append(dp)
    elif stage == specs.Stage.OUTPUT:
      outputs.append(dp)
    else:
      hints.append(dp)

  return samplers.Feedback(
      samplers.Features(tuple(inputs), tuple(hints), lengths), tuple(outputs))


def create_dataset(folder, algorithm, split, batch_size):
  dataset = tfds.load(f'clrs_dataset/{algorithm}_{split}',
                      data_dir=folder, split=split)
  num_samples = len(dataset)  # Must be done here for correct size
  dataset = dataset.repeat()
  dataset = dataset.batch(batch_size)
  return (dataset.map(lambda d: _preprocess(d, algorithm=algorithm)),
          num_samples,
          specs.SPECS[algorithm])


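# Example (illustrative sketch; the folder, algorithm and batch size are
# placeholders): iterate full-trajectory batches for one algorithm/split:
#
#   ds, num_samples, spec = create_dataset(
#       folder='/tmp/CLRS30', algorithm='bfs', split='train', batch_size=32)
#   feedback = next(ds.as_numpy_iterator())
#   # feedback.features holds inputs, hints and lengths;
#   # feedback.outputs holds the ground-truth outputs.

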
def _copy_hint(source, dest, i, start_source, start_dest, to_add):
  """Copy from full-sample hint to a hint chunk."""
  assert np.all(dest[start_dest:, i:] == 0)
  assert start_dest < dest.shape[0]
  assert start_dest + to_add <= dest.shape[0]
  assert start_source < source.shape[0]
  assert start_source + to_add <= source.shape[0]
  dest[start_dest:start_dest+to_add, i] = source[
      start_source:start_source+to_add, i]
  return dest


def _copy_io(source, dest, i, start_dest, to_add):
  """Copy from an input or output to an input or output chunk."""
  assert np.all(dest[start_dest:, i:] == 0)
  dest[start_dest:start_dest+to_add, i] = source[i]
  return dest


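# When assembling chunks below, `_copy_io` repeats a sample's time-less
# input/output tensor across the `to_add` chunk steps assigned to it, while
# `_copy_hint` copies the matching time window from the time-major hint
# tensor.

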
def chunkify(dataset: Iterator[samplers.Feedback], chunk_length: int):
  """Generator of fixed-length chunks from full-trajectory samples.

  Args:
    dataset: full-sample dataset as numpy iterator.
    chunk_length: time length of chunks.

  Yields:
    Fixed-timelength chunks of data. Each tensor of inputs, hints and outputs
    has dimensions chunk_length x batch_size x ... Samples are not time-padded:
    after the end of one sample immediately comes the next. Since different
    samples can have different time lengths, the beginnings and ends of samples
    within a batch do not need to coincide. For this reason, the chunked
    dataset features include two chunk_length x batch_size int tensors,
    `is_first` and `is_last`, that mark the beginning and end of each sample.
    For example, if `chunk_length`==6 and `batch_size`==2 and the first
    full-sample batch had one sample of length 3 and one of length 5,
    we would have a first chunked batch with the following `is_first` and
    `is_last` tensors:

    is_first = [[1, 1]    is_last = [[0, 0]    ( sample id [[0 1]
                [0, 0]               [0, 0]                 [0 1]
                [0, 0]               [1, 0]                 [0 1]
                [1, 0]               [0, 0]                 [2 1]
                [0, 0]               [0, 1]                 [2 1]
                [0, 1]]              [0, 0]]                [2 3]] )

    while the data in the inputs, outputs and hints tensors would correspond
    to samples as identified by the sample_id indicated above for reference.

    Notice that, while in the full-sample dataset inputs and outputs have
    no time dimension, here they do; the input and output tensors are simply
    repeated along each sample's time length.
  """

  def _get_batch():
    d = next(dataset)
    return (d.features.inputs, d.features.hints, d.outputs,
            d.features.lengths.astype(int))

  inputs, hints, outputs, lengths = _get_batch()
  for inp in inputs:
    if inp.location in [specs.Location.NODE, specs.Location.EDGE]:
      batch_size = inp.data.shape[0]
      break

  io_chunk = lambda x: np.zeros((chunk_length,) + x.shape, dtype=x.dtype)
  chunk_inputs = jax.tree_util.tree_map(io_chunk, inputs)
  chunk_outputs = jax.tree_util.tree_map(io_chunk, outputs)

  hint_chunk = lambda x: np.zeros((chunk_length,) + x.shape[1:], dtype=x.dtype)
  chunk_hints = jax.tree_util.tree_map(hint_chunk, hints)

  inputs = [inputs]
  hints = [hints]
  outputs = [outputs]
  left = [lengths.copy()]
  lengths = [lengths.copy()]

  while True:
    # Create a new empty chunk
    chunk_inputs = jax.tree_util.tree_map(np.zeros_like, chunk_inputs)
    chunk_hints = jax.tree_util.tree_map(np.zeros_like, chunk_hints)
    chunk_outputs = jax.tree_util.tree_map(np.zeros_like, chunk_outputs)
    start_mark = np.zeros((chunk_length, batch_size), dtype=int)
    end_mark = np.zeros((chunk_length, batch_size), dtype=int)

    # Get enough data batches to fill the new chunk
    while np.any(np.sum(left, axis=0) < chunk_length):
      inp, hh, out, ll = _get_batch()
      inputs.append(inp)
      hints.append(hh)
      outputs.append(out)
      left.append(ll.copy())
      lengths.append(ll.copy())

    # Fill the chunk, one batch element at a time
    for i in range(batch_size):
      total, idx = 0, 0
      while total < chunk_length:
        to_add = min(left[idx][i], chunk_length - total)
        if to_add:
          start = lengths[idx][i] - left[idx][i]
          assert start >= 0
          f_io = functools.partial(_copy_io, i=i, start_dest=total,
                                   to_add=to_add)
          chunk_inputs = jax.tree_util.tree_map(f_io, inputs[idx], chunk_inputs)
          chunk_outputs = jax.tree_util.tree_map(f_io, outputs[idx],
                                                 chunk_outputs)
          f_hint = functools.partial(_copy_hint, i=i, start_source=start,
                                     start_dest=total, to_add=to_add)
          chunk_hints = jax.tree_util.tree_map(f_hint, hints[idx], chunk_hints)
          if start == 0:
            start_mark[total, i] = 1
          total += to_add
          left[idx][i] -= to_add
          assert left[idx][i] >= 0
          if left[idx][i] == 0:
            end_mark[total - 1, i] = 1
        idx += 1
      assert total == chunk_length

    # Drop batches whose samples have all been fully consumed.
    while left and np.all(left[0] == 0):
      inputs.pop(0)
      hints.pop(0)
      outputs.pop(0)
      left.pop(0)
      lengths.pop(0)

    yield samplers.Feedback(
        samplers.FeaturesChunked(chunk_inputs, chunk_hints,
                                 start_mark, end_mark),
        chunk_outputs)


def create_chunked_dataset(folder, algorithm, split, batch_size, chunk_length):
  dataset = tfds.load(f'clrs_dataset/{algorithm}_{split}',
                      data_dir=folder, split=split)
  dataset = dataset.repeat()
  dataset = dataset.batch(batch_size)
  dataset = dataset.map(lambda d: _preprocess(d, algorithm=algorithm))
  dataset = dataset.as_numpy_iterator()
  return chunkify(dataset, chunk_length), specs.SPECS[algorithm]


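# Example (illustrative sketch; the folder, algorithm, batch size and chunk
# length are placeholders): the chunked pipeline yields fixed-time-length
# batches with `is_first`/`is_last` markers:
#
#   chunked_ds, spec = create_chunked_dataset(
#       folder='/tmp/CLRS30', algorithm='bfs', split='train',
#       batch_size=32, chunk_length=16)
#   feedback = next(chunked_ds)
#   # feedback.features.is_first and feedback.features.is_last have shape
#   # [chunk_length, batch_size].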