reach-vb HF staff committed on
Commit
6017349
·
1 Parent(s): 3962680

8643e0bff997343019440dbb643d796bcec3eb5d247e4cbf77173c5941ca163f

Browse files
Files changed (50) hide show
  1. lib/python3.11/site-packages/fsspec/fuse.py +324 -0
  2. lib/python3.11/site-packages/fsspec/generic.py +403 -0
  3. lib/python3.11/site-packages/fsspec/gui.py +413 -0
  4. lib/python3.11/site-packages/fsspec/implementations/__init__.py +0 -0
  5. lib/python3.11/site-packages/fsspec/implementations/__pycache__/__init__.cpython-311.pyc +0 -0
  6. lib/python3.11/site-packages/fsspec/implementations/__pycache__/arrow.cpython-311.pyc +0 -0
  7. lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc +0 -0
  8. lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc +0 -0
  9. lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc +0 -0
  10. lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc +0 -0
  11. lib/python3.11/site-packages/fsspec/implementations/__pycache__/data.cpython-311.pyc +0 -0
  12. lib/python3.11/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-311.pyc +0 -0
  13. lib/python3.11/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-311.pyc +0 -0
  14. lib/python3.11/site-packages/fsspec/implementations/__pycache__/ftp.cpython-311.pyc +0 -0
  15. lib/python3.11/site-packages/fsspec/implementations/__pycache__/git.cpython-311.pyc +0 -0
  16. lib/python3.11/site-packages/fsspec/implementations/__pycache__/github.cpython-311.pyc +0 -0
  17. lib/python3.11/site-packages/fsspec/implementations/__pycache__/http.cpython-311.pyc +0 -0
  18. lib/python3.11/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-311.pyc +0 -0
  19. lib/python3.11/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-311.pyc +0 -0
  20. lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc +0 -0
  21. lib/python3.11/site-packages/fsspec/implementations/__pycache__/memory.cpython-311.pyc +0 -0
  22. lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc +0 -0
  23. lib/python3.11/site-packages/fsspec/implementations/__pycache__/sftp.cpython-311.pyc +0 -0
  24. lib/python3.11/site-packages/fsspec/implementations/__pycache__/smb.cpython-311.pyc +0 -0
  25. lib/python3.11/site-packages/fsspec/implementations/__pycache__/tar.cpython-311.pyc +0 -0
  26. lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc +0 -0
  27. lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc +0 -0
  28. lib/python3.11/site-packages/fsspec/implementations/arrow.py +297 -0
  29. lib/python3.11/site-packages/fsspec/implementations/cache_mapper.py +80 -0
  30. lib/python3.11/site-packages/fsspec/implementations/cache_metadata.py +232 -0
  31. lib/python3.11/site-packages/fsspec/implementations/cached.py +864 -0
  32. lib/python3.11/site-packages/fsspec/implementations/dask.py +152 -0
  33. lib/python3.11/site-packages/fsspec/implementations/data.py +48 -0
  34. lib/python3.11/site-packages/fsspec/implementations/dbfs.py +457 -0
  35. lib/python3.11/site-packages/fsspec/implementations/dirfs.py +358 -0
  36. lib/python3.11/site-packages/fsspec/implementations/ftp.py +380 -0
  37. lib/python3.11/site-packages/fsspec/implementations/git.py +127 -0
  38. lib/python3.11/site-packages/fsspec/implementations/github.py +215 -0
  39. lib/python3.11/site-packages/fsspec/implementations/http.py +864 -0
  40. lib/python3.11/site-packages/fsspec/implementations/jupyter.py +124 -0
  41. lib/python3.11/site-packages/fsspec/implementations/libarchive.py +213 -0
  42. lib/python3.11/site-packages/fsspec/implementations/local.py +414 -0
  43. lib/python3.11/site-packages/fsspec/implementations/memory.py +292 -0
  44. lib/python3.11/site-packages/fsspec/implementations/reference.py +1122 -0
  45. lib/python3.11/site-packages/fsspec/implementations/sftp.py +180 -0
  46. lib/python3.11/site-packages/fsspec/implementations/smb.py +324 -0
  47. lib/python3.11/site-packages/fsspec/implementations/tar.py +124 -0
  48. lib/python3.11/site-packages/fsspec/implementations/webhdfs.py +467 -0
  49. lib/python3.11/site-packages/fsspec/implementations/zip.py +133 -0
  50. lib/python3.11/site-packages/fsspec/mapping.py +247 -0
lib/python3.11/site-packages/fsspec/fuse.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import stat
5
+ import threading
6
+ import time
7
+ from errno import EIO, ENOENT
8
+
9
+ from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
10
+
11
+ from fsspec import __version__
12
+ from fsspec.core import url_to_fs
13
+
14
+ logger = logging.getLogger("fsspec.fuse")
15
+
16
+
17
class FUSEr(Operations):
    """FUSE operations that expose a sub-tree of an fsspec file-system.

    Every path received from FUSE is re-rooted onto ``self.root`` of the
    wrapped file-system.  Files opened through ``open``/``create`` are kept in
    ``self.cache``, keyed by the integer handle returned to FUSE, until
    ``release`` is called for that handle.
    """

    # Paths under which the optional status file may be requested.
    _READY_PATHS = ("/.fuse_ready", ".fuse_ready")

    def __init__(self, fs, path, ready_file=False):
        """
        Parameters
        ----------
        fs: file-system instance
            Backend that all operations are delegated to.
        path: str
            Location on ``fs`` that becomes the root of the mount.
        ready_file: bool
            If True, ``getattr`` and ``read`` answer for a virtual
            ``.fuse_ready`` file whose content is ``b"ready"``.
        """
        self.fs = fs
        self.cache = {}
        self.root = path.rstrip("/") + "/"
        self.counter = 0
        logger.info("Starting FUSE at %s", path)
        self._ready_file = ready_file

    def _full_path(self, path):
        """Return the FUSE-relative ``path`` re-rooted onto the backend."""
        return "".join([self.root, path.lstrip("/")])

    def _is_ready_file(self, path):
        """Tell whether ``path`` addresses the (enabled) status file."""
        return self._ready_file and path in self._READY_PATHS

    def _new_handle(self, f):
        """Remember the open file ``f`` and return its integer handle."""
        fh = self.counter
        self.cache[fh] = f
        self.counter += 1
        return fh

    def getattr(self, path, fh=None):
        """Return a stat-like dict for ``path``.

        Raises ``FuseOSError(ENOENT)`` when the backend reports that the path
        does not exist.
        """
        logger.debug("getattr %s", path)
        if self._is_ready_file(path):
            # size matches the b"ready" payload served by ``read``
            return {"type": "file", "st_size": 5}

        path = self._full_path(path).rstrip("/")
        try:
            info = self.fs.info(path)
        except FileNotFoundError as exc:
            raise FuseOSError(ENOENT) from exc

        data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
        perm = info.get("mode", 0o777)

        if info["type"] != "file":
            data["st_mode"] = stat.S_IFDIR | perm
            data["st_size"] = 0
            data["st_blksize"] = 0
        else:
            data["st_mode"] = stat.S_IFREG | perm
            data["st_size"] = info["size"]
            data["st_blksize"] = 5 * 2**20
            data["st_nlink"] = 1
        # backends that do not report timestamps get the current time
        now = time.time()
        data["st_atime"] = info.get("atime", now)
        data["st_ctime"] = info.get("ctime", now)
        data["st_mtime"] = info.get("mtime", now)
        return data

    def readdir(self, path, fh):
        """List the entry names of directory ``path``, plus "." and ".."."""
        logger.debug("readdir %s", path)
        files = self.fs.ls(self._full_path(path), False)
        files = [os.path.basename(f.rstrip("/")) for f in files]
        return [".", ".."] + files

    def mkdir(self, path, mode):
        """Create directory ``path`` on the backend; ``mode`` is not used."""
        self.fs.mkdir(self._full_path(path))
        return 0

    def rmdir(self, path):
        """Remove directory ``path`` on the backend."""
        self.fs.rmdir(self._full_path(path))
        return 0

    def read(self, path, size, offset, fh):
        """Read up to ``size`` bytes at ``offset`` from the file behind ``fh``."""
        logger.debug("read %s", (path, size, offset))
        if self._is_ready_file(path):
            # status indicator
            return b"ready"

        f = self.cache[fh]
        f.seek(offset)
        return f.read(size)

    def write(self, path, data, offset, fh):
        """Write ``data`` at ``offset`` to the file behind ``fh``.

        Returns the number of bytes handed over, i.e. ``len(data)``.
        """
        logger.debug("write %s", (path, offset))
        f = self.cache[fh]
        f.seek(offset)
        f.write(data)
        return len(data)

    def create(self, path, flags, fi=None):
        """Create ``path``, open it for binary writing and return its handle."""
        logger.debug("create %s", (path, flags))
        fn = self._full_path(path)
        self.fs.touch(fn)  # OS will want to get attributes immediately
        return self._new_handle(self.fs.open(fn, "wb"))

    def open(self, path, flags):
        """Open ``path`` and return a new handle.

        An even ``flags`` value opens the file with mode "rb"; an odd value
        opens it with mode "wb".
        """
        logger.debug("open %s", (path, flags))
        fn = self._full_path(path)
        # even -> read, odd -> write/create
        mode = "rb" if flags % 2 == 0 else "wb"
        return self._new_handle(self.fs.open(fn, mode))

    def truncate(self, path, length, fh=None):
        """Truncate ``path`` to zero length.

        Raises ``NotImplementedError`` for any non-zero ``length``.
        """
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.fs.touch(self._full_path(path))

    def unlink(self, path):
        """Delete the file ``path``; any ``OSError`` becomes ``FuseOSError(EIO)``."""
        fn = self._full_path(path)
        try:
            self.fs.rm(fn, False)
        except OSError as exc:  # FileNotFoundError is a subclass of OSError
            raise FuseOSError(EIO) from exc

    def release(self, path, fh):
        """Close and forget the file behind ``fh``.

        Closing is best-effort: a failure is logged and 0 is still returned.
        The handle is dropped from the cache before closing so that a failing
        ``close`` cannot leave a stale entry behind.
        """
        f = self.cache.pop(fh, None)
        if f is not None:
            try:
                f.close()
            except Exception:
                logger.exception("Failed to close %s", path)
        return 0

    def chmod(self, path, mode):
        """Delegate to ``fs.chmod`` when the backend provides it.

        Raises ``NotImplementedError`` otherwise.
        """
        if hasattr(self.fs, "chmod"):
            return self.fs.chmod(self._full_path(path), mode)
        raise NotImplementedError
140
+
141
+
142
def run(
    fs,
    path,
    mount_point,
    foreground=True,
    threads=False,
    ready_file=False,
    ops_class=FUSEr,
):
    """Mount a path of an fsspec file-system in a local directory

    fusepy is used to make the given path of the fsspec instance appear as
    if it were part of the local file-system.

    fusepy must be installed, and FUSE must be available on the system
    (typically via a package installed with apt, yum, brew, etc.).

    Parameters
    ----------
    fs: file-system instance
        From one of the compatible implementations
    path: str
        Location on that file-system to regard as the root directory to
        mount. Note that you typically should include the terminating "/"
        character.
    mount_point: str
        An empty directory on the local file-system where the contents of
        the remote path will appear.
    foreground: bool
        Whether or not calling this function will block. Operation will
        typically be more stable if True.
    threads: bool
        Whether or not to create threads when responding to file operations
        within the mounted directory. Operation will typically be more
        stable if False.
    ready_file: bool
        If True, a ``.fuse_ready`` file will exist in the ``mount_point``
        directory once the FUSE process is ready. Debugging purpose.
    ops_class: FUSEr or Subclass of FUSEr
        To override the default behavior of FUSEr. For example, logging
        to file.

    Returns
    -------
    The started daemon ``threading.Thread`` when ``foreground`` is false,
    otherwise None (after the blocking mount has ended).
    """

    def mount():
        # Instantiating FUSE performs the mount.
        return FUSE(
            ops_class(fs, path, ready_file=ready_file),
            mount_point,
            nothreads=not threads,
            foreground=foreground,
        )

    if foreground:  # pragma: no cover
        try:
            mount()
        except KeyboardInterrupt:
            # Ctrl-C is the expected way to end a foreground mount.
            pass
        return None

    worker = threading.Thread(target=mount)
    worker.daemon = True
    worker.start()
    return worker
202
+
203
+
204
def main(args):
    """Mount filesystem from chained URL to MOUNT_POINT.

    Examples:

    python3 -m fsspec.fuse memory /usr/share /tmp/mem

    python3 -m fsspec.fuse local /tmp/source /tmp/local \\
            -l /tmp/fsspecfuse.log

    You can also mount chained-URLs and use special settings:

    python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
            / /tmp/zip \\
            -o 'filecache-cache_storage=/tmp/simplecache'

    You can specify the type of the setting by using `[int]` or `[bool]`,
    (`true`, `yes`, `1` represents the Boolean value `True`):

    python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
            /historic/packages/RPMS /tmp/ftp \\
            -o 'simplecache-cache_storage=/tmp/simplecache' \\
            -o 'simplecache-check_files=false[bool]' \\
            -o 'ftp-listings_expiry_time=60[int]' \\
            -o 'ftp-username=anonymous' \\
            -o 'ftp-password=xieyanbo'
    """
    # NOTE: the docstring above doubles as the ``--help`` description, so it
    # stays user-facing.  ``args`` is the list of command line arguments
    # without the program name (e.g. ``sys.argv[1:]``).

    class RawDescriptionArgumentParser(argparse.ArgumentParser):
        def format_help(self):
            # Put the unwrapped description back in place of the re-flowed
            # one, which is the second blank-line separated part of the help.
            usage = super().format_help()
            parts = usage.split("\n\n")
            parts[1] = self.description.rstrip()
            return "\n\n".join(parts)

    parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("url", type=str, help="fs url")
    parser.add_argument("source_path", type=str, help="source directory in fs")
    parser.add_argument("mount_point", type=str, help="local directory")
    parser.add_argument(
        "-o",
        "--option",
        action="append",
        help="Any options of protocol included in the chained URL",
    )
    parser.add_argument(
        "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
    )
    # The three switches below use ``store_false``: each value defaults to
    # True and giving the switch turns the feature OFF.  The help texts
    # describe that actual behaviour.
    parser.add_argument(
        "-f",
        "--foreground",
        action="store_false",
        help="Do not run in the foreground (Default: run in the foreground)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        action="store_false",
        help="Disable threads support (Default: threads enabled)",
    )
    parser.add_argument(
        "-r",
        "--ready-file",
        action="store_false",
        help="Do not expose the `.fuse_ready` file, which otherwise exists "
        "after FUSE is ready. (Debugging purpose, Default: file is exposed)",
    )
    args = parser.parse_args(args)

    # Turn every ``-o key=value`` into a keyword argument; ``proto-name=value``
    # becomes an entry of the nested dict for protocol ``proto``.
    kwargs = {}
    for item in args.option or []:
        key, sep, value = item.partition("=")
        if not sep:
            parser.error(message=f"Wrong option: {item!r}")
        val = value.lower()
        if val.endswith("[int]"):
            try:
                value = int(value[: -len("[int]")])
            except ValueError:
                # report through argparse instead of a raw traceback
                parser.error(message=f"Wrong option: {item!r}")
        elif val.endswith("[bool]"):
            value = val[: -len("[bool]")] in ["1", "yes", "true"]

        if "-" in key:
            fs_name, setting_name = key.split("-", 1)
            kwargs.setdefault(fs_name, {})[setting_name] = value
        else:
            kwargs[key] = value

    if args.log_file:
        logging.basicConfig(
            level=logging.DEBUG,
            filename=args.log_file,
            format="%(asctime)s %(message)s",
        )

        class LoggingFUSEr(FUSEr, LoggingMixIn):
            pass

        fuser = LoggingFUSEr
    else:
        fuser = FUSEr

    fs, url_path = url_to_fs(args.url, **kwargs)
    logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
    run(
        fs,
        args.source_path,
        args.mount_point,
        foreground=args.foreground,
        threads=args.threads,
        ready_file=args.ready_file,
        ops_class=fuser,
    )
319
+
320
+
321
if __name__ == "__main__":
    # Script entry point: hand the command line (minus the program name)
    # over to ``main``.
    import sys

    cli_args = sys.argv[1:]
    main(cli_args)
lib/python3.11/site-packages/fsspec/generic.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import shutil
7
+ import uuid
8
+ from typing import Optional
9
+
10
+ from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
11
+ from .callbacks import _DEFAULT_CALLBACK
12
+ from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
13
+
14
+ _generic_fs = {}
15
+ logger = logging.getLogger("fsspec.generic")
16
+
17
+
18
def set_generic_fs(protocol, **storage_options):
    """Register the backend instance to use for ``protocol``.

    A file-system is created with ``filesystem(protocol, **storage_options)``
    and stored in the module-level ``_generic_fs`` dict, replacing any
    instance previously stored for the same protocol.
    """
    instance = filesystem(protocol, **storage_options)
    _generic_fs[protocol] = instance
20
+
21
+
22
+ default_method = "default"
23
+
24
+
25
def _resolve_fs(url, method=None, protocol=None, storage_options=None):
    """Pick instance of backend FS

    Parameters
    ----------
    url: str
        URL whose protocol selects the backend (unless ``protocol`` is given).
    method: str | None
        One of "default", "generic", "current" or "options"; falls back to
        the module-level ``default_method`` when not given.
    protocol: str | None
        Overrides the protocol that would be split off ``url``.
    storage_options: dict | None
        Per-protocol keyword arguments, only consulted by method "options".

    Raises
    ------
    ValueError
        If ``method`` is not one of the known names.
    """
    if not method:
        method = default_method
    if not protocol:
        protocol = split_protocol(url)[0]
    if not storage_options:
        storage_options = {}

    if method == "default":
        # plain instance created without extra arguments
        return filesystem(protocol)
    elif method == "generic":
        # instance registered beforehand in ``_generic_fs``
        return _generic_fs[protocol]
    elif method == "current":
        backend_class = get_filesystem_class(protocol)
        return backend_class.current()
    elif method == "options":
        backend, _ = url_to_fs(url, **storage_options.get(protocol, {}))
        return backend
    raise ValueError(f"Unknown FS resolution method: {method}")
41
+
42
+
43
def rsync(
    source,
    destination,
    delete_missing=False,
    source_field="size",
    dest_field="size",
    update_cond="different",
    inst_kwargs=None,
    fs=None,
    **kwargs,
):
    """Sync files between two directory trees

    (experimental)

    Parameters
    ----------
    source: str
        Root of the directory tree to take files from. This must be a directory, but
        do not include any terminating "/" character
    destination: str
        Root path to copy into. The contents of this location should be
        identical to the contents of ``source`` when done. This will be made a
        directory, and the terminal "/" should not be included.
    delete_missing: bool
        If there are paths in the destination that don't exist in the
        source and this is True, delete them. Otherwise, leave them alone.
    source_field: str | callable
        If ``update_cond`` is "different", this is the key in the info
        of source files to consider for difference. May be a function of the
        info dict.
    dest_field: str | callable
        If ``update_cond`` is "different", this is the key in the info
        of destination files to consider for difference. May be a function of
        the info dict.
    update_cond: "different"|"always"|"never"
        If "always", every file is copied, regardless of whether it exists in
        the destination. If "never", files that exist in the destination are
        not copied again. If "different" (default), only copy if the info
        fields given by ``source_field`` and ``dest_field`` (usually "size")
        are different. Other comparisons may be added in the future.
    inst_kwargs: dict|None
        If ``fs`` is None, use this set of keyword arguments to make a
        GenericFileSystem instance
    fs: GenericFileSystem|None
        Instance to use if explicitly given. The instance defines how to
        make downstream file system instances from paths.
    **kwargs:
        Passed on to ``fs.cp``.

    Raises
    ------
    ValueError
        If ``source`` is not a directory.
    """
    fs = fs or GenericFileSystem(**(inst_kwargs or {}))
    source = fs._strip_protocol(source)
    destination = fs._strip_protocol(destination)
    allfiles = fs.find(source, withdirs=True, detail=True)
    if not fs.isdir(source):
        raise ValueError("Can only rsync on a directory")
    otherfiles = fs.find(destination, withdirs=True, detail=True)

    # directories present in the source but not yet in the destination
    dirs = [
        a
        for a, v in allfiles.items()
        if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
    ]
    logger.debug("%s directories to create", len(dirs))
    if dirs:
        fs.make_many_dirs(
            [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
        )

    allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
    logger.debug("%s files to consider for copy", len(allfiles))
    to_delete = [
        o
        for o, v in otherfiles.items()
        if o.replace(destination, source) not in allfiles and v["type"] == "file"
    ]

    # source path -> destination path, for the files that need copying
    to_copy = {}
    for k, v in allfiles.items():
        otherfile = k.replace(source, destination)
        if otherfile not in otherfiles:
            # file not in target yet
            to_copy[k] = otherfile
        elif update_cond == "always":
            to_copy[k] = otherfile
        elif update_cond == "different":
            inf1 = source_field(v) if callable(source_field) else v[source_field]
            v2 = otherfiles[otherfile]
            inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
            if inf1 != inf2:
                # details mismatch, make copy
                to_copy[k] = otherfile
        # else ("never"): the file already exists in the target, leave it.
        # Without this case the info dict of such files was passed on to
        # ``fs.cp`` as if it were the destination path.

    logger.debug("%s files to copy", len(to_copy))
    if to_copy:
        source_files, target_files = zip(*to_copy.items())
        fs.cp(source_files, target_files, **kwargs)
    logger.debug("%s files to delete", len(to_delete))
    # an empty list must not reach ``fs.rm``: there is nothing to resolve a
    # backend from
    if delete_missing and to_delete:
        fs.rm(to_delete)
140
+
141
+
142
class GenericFileSystem(AsyncFileSystem):
    """Wrapper over all other FS types

    <experimental!>

    This implementation is a single unified interface to be able to run FS operations
    over generic URLs, and dispatch to the specific implementations using the URL
    protocol prefix.

    Each operation resolves the backend from the URL it is given and calls
    the backend's coroutine when ``fs.async_impl`` is true, or its blocking
    method otherwise.

    Note: instances of this FS are always async, even if you never use it with any async
    backend.
    """

    protocol = "generic"  # there is no real reason to ever use a protocol with this FS

    def __init__(self, default_method="default", **kwargs):
        """

        Parameters
        ----------
        default_method: str (optional)
            Defines how to configure backend FS instances. Options are:
            - "default": instantiate like FSClass(), with no
              extra arguments; this is the default instance of that FS, and can be
              configured via the config system
            - "generic": takes instances from the `_generic_fs` dict in this module,
              which you must populate before use. Keys are by protocol
            - "current": takes the most recently instantiated version of each FS
        """
        self.method = default_method
        super().__init__(**kwargs)

    def _parent(self, path):
        """Return the parent of ``path`` as a full URL (protocol included)."""
        fs = _resolve_fs(path, self.method)
        return fs.unstrip_protocol(fs._parent(path))

    def _strip_protocol(self, path):
        """Normalize ``path`` via its backend; the protocol is kept."""
        # normalization only
        fs = _resolve_fs(path, self.method)
        return fs.unstrip_protocol(fs._strip_protocol(path))

    async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        """Find everything below ``path``, with names as full URLs.

        Returns a dict of name -> info when ``detail`` is true, else a list
        of names.
        """
        fs = _resolve_fs(path, self.method)
        # details are always requested so entries can be re-keyed by URL
        if fs.async_impl:
            out = await fs._find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        else:
            out = fs.find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        result = {}
        for k, v in out.items():
            name = fs.unstrip_protocol(k)
            v["name"] = name
            result[name] = v
        if detail:
            return result
        return list(result)

    async def _info(self, url, **kwargs):
        """Return the backend's info dict for ``url``, named by full URL."""
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            out = await fs._info(url, **kwargs)
        else:
            out = fs.info(url, **kwargs)
        out["name"] = fs.unstrip_protocol(out["name"])
        return out

    async def _ls(
        self,
        url,
        detail=True,
        **kwargs,
    ):
        """List ``url``; names (or the ``name`` of each info) are full URLs."""
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            out = await fs._ls(url, detail=True, **kwargs)
        else:
            out = fs.ls(url, detail=True, **kwargs)
        for o in out:
            o["name"] = fs.unstrip_protocol(o["name"])
        if detail:
            return out
        else:
            return [o["name"] for o in out]

    async def _cat_file(
        self,
        url,
        **kwargs,
    ):
        """Return the backend's ``cat_file`` result for ``url``."""
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            return await fs._cat_file(url, **kwargs)
        else:
            return fs.cat_file(url, **kwargs)

    async def _pipe_file(
        self,
        path,
        value,
        **kwargs,
    ):
        """Write ``value`` to ``path`` through the backend's ``pipe_file``."""
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            return await fs._pipe_file(path, value, **kwargs)
        else:
            return fs.pipe_file(path, value, **kwargs)

    async def _rm(self, url, **kwargs):
        """Remove one URL or a list of URLs.

        The backend is resolved from the first URL only, so all URLs are
        expected to belong to the same backend.  An empty list is a no-op.
        """
        urls = url
        if isinstance(urls, str):
            urls = [urls]
        if not urls:
            # nothing to delete, and no URL to resolve a backend from
            return
        fs = _resolve_fs(urls[0], self.method)
        if fs.async_impl:
            await fs._rm(urls, **kwargs)
        else:
            fs.rm(urls, **kwargs)

    async def _makedirs(self, path, exist_ok=False):
        """Create directory ``path`` through the backend's ``makedirs``."""
        logger.debug("Make dir %s", path)
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            await fs._makedirs(path, exist_ok=exist_ok)
        else:
            fs.makedirs(path, exist_ok=exist_ok)

    def rsync(self, source, destination, **kwargs):
        """Sync files between two directory trees

        See `func:rsync` for more details.
        """
        rsync(source, destination, fs=self, **kwargs)

    async def _cp_file(
        self,
        url,
        url2,
        blocksize=2**20,
        callback=_DEFAULT_CALLBACK,
        **kwargs,
    ):
        """Copy a single file from ``url`` to ``url2``.

        When both URLs resolve to the same backend instance the copy is
        delegated to it.  Otherwise the data is streamed through this
        process in chunks of ``blocksize``, reporting progress to
        ``callback``.
        """
        fs = _resolve_fs(url, self.method)
        fs2 = _resolve_fs(url2, self.method)
        if fs is fs2:
            # pure remote
            if fs.async_impl:
                return await fs._cp_file(url, url2, **kwargs)
            else:
                return fs.cp_file(url, url2, **kwargs)
        kw = {"blocksize": 0, "cache_type": "none"}
        # Pre-bind both handles so the cleanup below can tell which of them
        # were actually opened.
        f1 = f2 = None
        try:
            f1 = (
                await fs.open_async(url, "rb")
                if hasattr(fs, "open_async")
                else fs.open(url, "rb", **kw)
            )
            callback.set_size(await maybe_await(f1.size))
            f2 = (
                await fs2.open_async(url2, "wb")
                if hasattr(fs2, "open_async")
                else fs2.open(url2, "wb", **kw)
            )
            while f1.size is None or f2.tell() < f1.size:
                data = await maybe_await(f1.read(blocksize))
                if f1.size is None and not data:
                    # unknown size: an empty read marks the end of the data
                    break
                await maybe_await(f2.write(data))
                callback.absolute_update(f2.tell())
        finally:
            # Close whatever was opened; the source is closed even when
            # opening or closing the destination failed.
            try:
                if f2 is not None:
                    await maybe_await(f2.close())
            finally:
                if f1 is not None:
                    await maybe_await(f1.close())

    async def _make_many_dirs(self, urls, exist_ok=True):
        """Create all directories in ``urls``.

        The backend is resolved from the first URL only.  An empty list is a
        no-op.
        """
        if not urls:
            return
        fs = _resolve_fs(urls[0], self.method)
        if fs.async_impl:
            coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
            await _run_coros_in_chunks(coros)
        else:
            for u in urls:
                fs.makedirs(u, exist_ok=exist_ok)

    make_many_dirs = sync_wrapper(_make_many_dirs)

    async def _copy(
        self,
        path1: list[str],
        path2: list[str],
        recursive: bool = False,
        on_error: str = "ignore",
        maxdepth: Optional[int] = None,
        batch_size: Optional[int] = None,
        tempdir: Optional[str] = None,
        **kwargs,
    ):
        """Copy the files ``path1`` to the matching entries of ``path2``.

        Backends are resolved from the first element of each list.  Within
        one backend the copy is delegated; across backends every file goes
        through a local temporary file (see ``copy_file_op``).

        Raises
        ------
        NotImplementedError
            If ``recursive`` is true.
        """
        if recursive:
            raise NotImplementedError
        fs = _resolve_fs(path1[0], self.method)
        fs2 = _resolve_fs(path2[0], self.method)
        # not expanding paths atm., assume call is from rsync()
        if fs is fs2:
            # pure remote
            if fs.async_impl:
                return await fs._copy(path1, path2, **kwargs)
            else:
                return fs.copy(path1, path2, **kwargs)
        await copy_file_op(
            fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
        )
356
+
357
+
358
async def copy_file_op(
    fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
):
    """Copy the files ``url1`` on ``fs1`` to ``url2`` on ``fs2`` via local disk.

    Each source/target pair gets its own uniquely named staging file inside
    ``tempdir``; the transfers are run through ``_run_coros_in_chunks`` with
    the given ``batch_size``.

    A temporary directory is created when ``tempdir`` is not given.  The
    directory is removed at the end in either case, so a caller-supplied
    ``tempdir`` is deleted as well.
    """
    import tempfile

    if not tempdir:
        tempdir = tempfile.mkdtemp()
    try:
        transfers = []
        for src, dst in zip(url1, url2):
            staging = os.path.join(tempdir, uuid.uuid4().hex)
            transfers.append(
                _copy_file_op(fs1, src, fs2, dst, staging, on_error=on_error)
            )
        await _run_coros_in_chunks(transfers, batch_size=batch_size)
    finally:
        shutil.rmtree(tempdir)
379
+
380
+
381
async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
    """Copy one file: download ``url1`` to ``local``, then upload to ``url2``.

    The staging file ``local`` is deleted after a successful upload.  With
    ``on_error="raise"`` exceptions propagate; with any other value they are
    logged at debug level and swallowed.
    """
    # ``except ()`` matches nothing, which is how "raise" lets errors through.
    swallowed = Exception if on_error != "raise" else ()
    logger.debug("Copy %s -> %s", url1, url2)
    try:
        if not fs1.async_impl:
            fs1.get_file(url1, local)
        else:
            await fs1._get_file(url1, local)

        if not fs2.async_impl:
            fs2.put_file(local, url2)
        else:
            await fs2._put_file(local, url2)

        os.unlink(local)
        logger.debug("Copy %s -> %s; done", url1, url2)
    except swallowed as err:
        logger.debug("ignoring cp exception for %s: %s", url1, err)
397
+
398
+
399
async def maybe_await(cor):
    """Return the result of ``cor`` if it is a coroutine, else ``cor`` itself.

    Lets callers treat the return value of a method that may be either
    blocking or ``async`` in a uniform way.
    """
    return (await cor) if inspect.iscoroutine(cor) else cor
lib/python3.11/site-packages/fsspec/gui.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import contextlib
3
+ import logging
4
+ import os
5
+ import re
6
+ from typing import ClassVar, Sequence
7
+
8
+ import panel as pn
9
+
10
+ from .core import OpenFile, get_filesystem_class, split_protocol
11
+ from .registry import known_implementations
12
+
13
+ pn.extension()
14
+ logger = logging.getLogger("fsspec.gui")
15
+
16
+
17
class SigSlot:
    """Signal-slot mixin, for Panel event passing

    Add this class to the bases of a widget manager so that it can register
    named events on the Panel widgets it owns and dispatch them to callbacks.

    ``_register`` is called as widgets are created; external code calls
    ``connect`` to attach callbacks to the named events.

    By default, every emitted signal produces a DEBUG logging statement.
    """

    # names of signals that this class may emit, each of which must be
    # set by _register for any new instance
    signals: ClassVar[Sequence[str]] = []
    # names of actions that this class may respond to, each of which must
    # be a method name
    slots: ClassVar[Sequence[str]] = []

    def __init__(self):
        self._ignoring_events = False
        # signal name -> {"widget", "callbacks", "thing", "log"}
        self._sigs = {}
        # "<widget name>-<watched attribute>" -> signal name
        self._map = {}
        self._setup()

    def _setup(self):
        """Create GUI elements and register signals"""
        # the base class has no signals to set up
        self.panel = pn.pane.PaneBase()

    def _register(
        self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
    ):
        """Watch the given attribute of a widget and assign it a named event

        Normally called by the owning class at the time the widget is
        instantiated.

        Parameters
        ----------
        widget : pn.layout.Panel or None
            Widget to watch. If None, an anonymous signal not associated with
            any widget.
        name : str
            Name of this event; must be listed in ``signals``
        thing : str
            Attribute of the given widget to watch
        log_level : int
            Logging level used when the signal is emitted
        auto : bool
            If True, and this instance has an attribute called ``name``,
            connect the signal to it.

        Raises
        ------
        ValueError
            If ``name`` is not declared in ``signals``
        """
        if name not in self.signals:
            raise ValueError(f"Attempt to assign an undeclared signal: {name}")
        self._sigs[name] = dict(
            widget=widget, callbacks=[], thing=thing, log=log_level
        )
        if widget is None:
            source = "none"
        else:
            source = getattr(widget, "name", str(widget))
        self._map["-".join((source, thing))] = name
        if widget is not None:
            widget.param.watch(self._signal, thing, onlychanged=True)
        if auto and hasattr(self, name):
            self.connect(name, getattr(self, name))

    def _repr_mimebundle_(self, *args, **kwargs):
        """Display in a notebook or a server"""
        try:
            return self.panel._repr_mimebundle_(*args, **kwargs)
        except (ValueError, AttributeError):
            raise NotImplementedError("Panel does not seem to be set up properly")

    def connect(self, signal, slot):
        """Associate a callback with the given event

        The callback is called with the "new" value of the watched attribute
        as its only argument. A callback returning False stops any further
        callbacks for that emission.

        ``slot`` may instead be a string, meaning that the signal of that
        name is emitted on this instance (i.e., connect to self).
        """
        self._sigs[signal]["callbacks"].append(slot)

    def _signal(self, event):
        """Handle a change event coming from a watched widget

        Does nothing inside a ``self.ignore_events()`` context.

        Tests can trigger this method by directly changing the values of
        widget components.
        """
        if self._ignoring_events:
            return
        key = "-".join([event.obj.name, event.name])
        if key not in self._map:
            return
        sig = self._map[key]
        if sig in self._sigs:
            self._emit(sig, event.new)

    @contextlib.contextmanager
    def ignore_events(self):
        """Temporarily turn off event processing in this instance

        (does not propagate to children)
        """
        self._ignoring_events = True
        try:
            yield
        finally:
            self._ignoring_events = False

    def _emit(self, sig, value=None):
        """An event happened, call its callbacks

        Can be used in tests to simulate message passing without touching
        visual elements.

        Callbacks run in connection order and stop as soon as one returns
        False. Exceptions raised by a callback are logged, not propagated.
        """
        entry = self._sigs[sig]
        logger.log(entry["log"], f"{sig}: {value}")
        for callback in entry["callbacks"]:
            if isinstance(callback, str):
                # string callbacks name another signal on this instance
                self._emit(callback)
                continue
            try:
                # running callbacks should not break the interface
                if callback(value) is False:
                    break
            except Exception as e:
                logger.exception(
                    "Exception (%s) while executing callback for signal: %s"
                    % (e, sig)
                )

    def show(self, threads=False):
        """Open a new browser tab and display this instance's interface"""
        self.panel.show(threads=threads, verbose=False)
        return self
164
+
165
+
166
class SingleSelect(SigSlot):
    """A multiselect which only allows you to select one item for an event"""

    signals = ["_selected", "selected"]  # the first is internal
    slots = ["set_options", "set_selection", "add", "clear", "select"]

    def __init__(self, **kwargs):
        # stored first: _setup(), called from the base __init__, reads them
        self.kwargs = kwargs
        super().__init__()

    def _setup(self):
        """Create the MultiSelect widget and wire the internal signal"""
        self.panel = pn.widgets.MultiSelect(**self.kwargs)
        self._register(self.panel, "_selected", "value")
        self._register(None, "selected")
        self.connect("_selected", self.select_one)

    def _signal(self, *args, **kwargs):
        """Forward widget events to the base implementation"""
        super()._signal(*args, **kwargs)

    def select_one(self, *_):
        """Reduce the selection to its last item, then emit ``selected``"""
        # the assignment below would otherwise re-trigger "_selected"
        with self.ignore_events():
            if self.panel.value:
                last_only = [self.panel.value[-1]]
            else:
                last_only = []
            self.panel.value = last_only
        self._emit("selected", self.panel.value)

    def set_options(self, options):
        """Replace the options shown in the widget"""
        self.panel.options = options

    def clear(self):
        """Remove all options from the widget"""
        self.panel.options = []

    @property
    def value(self):
        """Current value of the underlying widget"""
        return self.panel.value

    def set_selection(self, selection):
        """Select the single given item"""
        self.panel.value = [selection]
203
+
204
+
205
class FileSelector(SigSlot):
    """Panel-based graphical file selector widget

    Instances of this widget are interactive and can be displayed in jupyter by having
    them as the output of a cell, or in a separate browser tab using ``.show()``.
    """

    signals = [
        "protocol_changed",
        "selection_changed",
        "directory_entered",
        "home_clicked",
        "up_clicked",
        "go_clicked",
        "filters_changed",
    ]
    slots = ["set_filters", "go_home"]

    def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
        """

        Parameters
        ----------
        url : str (optional)
            Initial value of the URL to populate the dialog; should include protocol
        filters : list(str) (optional)
            File endings to include in the listings. If not included, all files are
            allowed. Does not affect directories.
            If given, the endings will appear as checkboxes in the interface
        ignore : list(str) (optional)
            Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
            hidden files on posix
        kwargs : dict or str (optional)
            To pass to file system instance. A string is used as the text of
            the kwargs box as-is; anything else is converted with ``str``.
        """
        if url:
            self.init_protocol, url = split_protocol(url)
        else:
            self.init_protocol, url = "file", os.getcwd()
        self.init_url = url
        # Text for the kwargs box. None maps to an empty dict literal rather
        # than the string "None", so the box always shows something that
        # reads as a dictionary.
        if kwargs is None:
            self.init_kwargs = "{}"
        elif isinstance(kwargs, str):
            self.init_kwargs = kwargs or "{}"
        else:
            self.init_kwargs = str(kwargs) or "{}"
        self.filters = filters
        self.ignore = [re.compile(i) for i in ignore or []]
        self._fs = None
        super().__init__()

    def _setup(self):
        """Build the widgets, register their signals and show the first listing"""
        self.url = pn.widgets.TextInput(
            name="url",
            value=self.init_url,
            align="end",
            sizing_mode="stretch_width",
            width_policy="max",
        )
        self.protocol = pn.widgets.Select(
            options=sorted(known_implementations),
            value=self.init_protocol,
            name="protocol",
            align="center",
        )
        self.kwargs = pn.widgets.TextInput(
            name="kwargs", value=self.init_kwargs, align="center"
        )
        self.go = pn.widgets.Button(name="⇨", align="end", width=45)
        self.main = SingleSelect(size=10)
        self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
        self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")

        self._register(self.protocol, "protocol_changed", auto=True)
        self._register(self.go, "go_clicked", "clicks", auto=True)
        self._register(self.up, "up_clicked", "clicks", auto=True)
        self._register(self.home, "home_clicked", "clicks", auto=True)
        self._register(None, "selection_changed")
        self.main.connect("selected", self.selection_changed)
        self._register(None, "directory_entered")
        # remembered so go_clicked can tell when the filesystem must be rebuilt
        self.prev_protocol = self.protocol.value
        self.prev_kwargs = self.storage_options

        self.filter_sel = pn.widgets.CheckBoxGroup(
            value=[], options=[], inline=False, align="end", width_policy="min"
        )
        self._register(self.filter_sel, "filters_changed", auto=True)

        self.panel = pn.Column(
            pn.Row(self.protocol, self.kwargs),
            pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
            self.main.panel,
        )
        self.set_filters(self.filters)
        self.go_clicked()

    def set_filters(self, filters=None):
        """Set the file-ending filters and show them, all ticked, as checkboxes"""
        self.filters = filters
        if filters:
            self.filter_sel.options = filters
            self.filter_sel.value = filters
        else:
            self.filter_sel.options = []
            self.filter_sel.value = []

    @property
    def storage_options(self):
        """Value of the kwargs box as a dictionary

        A blank box is treated as an empty dictionary.
        """
        text = (self.kwargs.value or "").strip()
        if not text:
            return {}
        return ast.literal_eval(text) or {}

    @property
    def fs(self):
        """Current filesystem instance"""
        if self._fs is None:
            cls = get_filesystem_class(self.protocol.value)
            self._fs = cls(**self.storage_options)
        return self._fs

    @property
    def urlpath(self):
        """URL of currently selected item, or None if nothing is selected"""
        if not self.main.value:
            return None
        return f"{self.protocol.value}://{self.main.value[0]}"

    def open_file(self, mode="rb", compression=None, encoding=None):
        """Create OpenFile instance for the currently selected item

        For example, in a notebook you might do something like

        .. code-block::

            [ ]: sel = FileSelector(); sel

            # user selects their file

            [ ]: with sel.open_file('rb') as f:
            ...      out = f.read()

        Parameters
        ----------
        mode: str (optional)
            Open mode for the file.
        compression: str (optional)
            Interact with the file as compressed. Set to 'infer' to guess
            compression from the file ending
        encoding: str (optional)
            If using text mode, use this encoding; defaults to UTF8.

        Raises
        ------
        ValueError
            If no item is currently selected
        """
        if self.urlpath is None:
            raise ValueError("No file selected")
        return OpenFile(self.fs, self.urlpath, mode, compression, encoding)

    def filters_changed(self, values):
        """Apply the ticked filter checkboxes and refresh the listing"""
        self.filters = values
        self.go_clicked()

    def selection_changed(self, *_):
        """Enter the selected item if it is a directory"""
        if self.urlpath is None:
            return
        if self.fs.isdir(self.urlpath):
            self.url.value = self.fs._strip_protocol(self.urlpath)
        self.go_clicked()

    def go_clicked(self, *_):
        """List the location in the URL box and populate the selector"""
        if (
            self.prev_protocol != self.protocol.value
            or self.prev_kwargs != self.storage_options
        ):
            self._fs = None  # causes fs to be recreated
            self.prev_protocol = self.protocol.value
            self.prev_kwargs = self.storage_options
        listing = sorted(
            self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
        )
        # drop entries whose basename matches any of the ignore patterns
        listing = [
            entry
            for entry in listing
            if not any(
                pattern.match(entry["name"].rsplit("/", 1)[-1])
                for pattern in self.ignore
            )
        ]
        folders = {
            "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "directory"
        }
        files = {
            "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "file"
        }
        if self.filters:
            # filters apply to files only; directories are always shown
            endings = tuple(self.filters)
            files = {k: v for k, v in files.items() if v.endswith(endings)}
        self.main.set_options(dict(**folders, **files))

    def protocol_changed(self, *_):
        """Reset the filesystem, listing and URL after a protocol change"""
        self._fs = None
        # clear the widget's options; assigning ``self.main.options`` would
        # only set an attribute on the SingleSelect wrapper
        self.main.clear()
        self.url.value = ""

    def home_clicked(self, *_):
        """Return to the protocol, kwargs and URL given at construction"""
        self.protocol.value = self.init_protocol
        self.kwargs.value = self.init_kwargs
        self.url.value = self.init_url
        self.go_clicked()

    def up_clicked(self, *_):
        """Move to the parent of the current location and list it"""
        self.url.value = self.fs._parent(self.url.value)
        self.go_clicked()
lib/python3.11/site-packages/fsspec/implementations/__init__.py ADDED
File without changes
lib/python3.11/site-packages/fsspec/implementations/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (237 Bytes). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/arrow.cpython-311.pyc ADDED
Binary file (15.2 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc ADDED
Binary file (4.84 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc ADDED
Binary file (12.8 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc ADDED
Binary file (45.6 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc ADDED
Binary file (7.8 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/data.cpython-311.pyc ADDED
Binary file (2.81 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-311.pyc ADDED
Binary file (19.6 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-311.pyc ADDED
Binary file (25.3 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/ftp.cpython-311.pyc ADDED
Binary file (19.4 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/git.cpython-311.pyc ADDED
Binary file (6.19 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/github.cpython-311.pyc ADDED
Binary file (11.4 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/http.cpython-311.pyc ADDED
Binary file (45.2 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-311.pyc ADDED
Binary file (7.33 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-311.pyc ADDED
Binary file (10.3 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc ADDED
Binary file (25.4 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/memory.cpython-311.pyc ADDED
Binary file (15.3 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc ADDED
Binary file (65.6 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/sftp.cpython-311.pyc ADDED
Binary file (10.9 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/smb.cpython-311.pyc ADDED
Binary file (16.4 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/tar.cpython-311.pyc ADDED
Binary file (5.41 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc ADDED
Binary file (24.2 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc ADDED
Binary file (6.67 kB). View file
 
lib/python3.11/site-packages/fsspec/implementations/arrow.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import errno
2
+ import io
3
+ import os
4
+ import secrets
5
+ import shutil
6
+ from contextlib import suppress
7
+ from functools import cached_property, wraps
8
+
9
+ from fsspec.spec import AbstractFileSystem
10
+ from fsspec.utils import (
11
+ get_package_version_without_import,
12
+ infer_storage_options,
13
+ mirror_from,
14
+ tokenize,
15
+ )
16
+
17
+
18
def wrap_exceptions(func):
    """Decorator translating "does not exist" OSErrors to FileNotFoundError

    If the wrapped callable raises an ``OSError`` whose first argument is a
    string containing "does not exist", a ``FileNotFoundError`` with
    ``errno.ENOENT`` and that message is raised from it. Any other
    ``OSError`` is re-raised unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except OSError as exception:
            details = exception.args
            # only the first positional argument is inspected
            if (
                details
                and isinstance(details[0], str)
                and "does not exist" in details[0]
            ):
                raise FileNotFoundError(errno.ENOENT, details[0]) from exception
            raise

    return wrapper
34
+
35
+
36
+ PYARROW_VERSION = None
37
+
38
+
39
class ArrowFSWrapper(AbstractFileSystem):
    """FSSpec-compatible wrapper of pyarrow.fs.FileSystem.

    Parameters
    ----------
    fs : pyarrow.fs.FileSystem

    """

    root_marker = "/"

    def __init__(self, fs, **kwargs):
        # the installed pyarrow version is consulted later by _open
        global PYARROW_VERSION
        PYARROW_VERSION = get_package_version_without_import("pyarrow")
        self.fs = fs
        super().__init__(**kwargs)

    @property
    def protocol(self):
        """Protocol name, taken from the wrapped filesystem's ``type_name``"""
        return self.fs.type_name

    @cached_property
    def fsid(self):
        """Identifier built from the wrapped filesystem's host and port"""
        return "hdfs_" + tokenize(self.fs.host, self.fs.port)

    @classmethod
    def _strip_protocol(cls, path):
        """Return the path component of ``path``"""
        ops = infer_storage_options(path)
        path = ops["path"]
        if path.startswith("//"):
            # special case for "hdfs://path" (without the triple slash)
            path = path[1:]
        return path

    def ls(self, path, detail=False, **kwargs):
        """List the entries directly under ``path``

        Returns a list of info dicts if ``detail`` is true, else a list of
        names.
        """
        path = self._strip_protocol(path)
        from pyarrow.fs import FileSelector

        entries = [
            self._make_entry(entry)
            for entry in self.fs.get_file_info(FileSelector(path))
        ]
        if detail:
            return entries
        else:
            return [entry["name"] for entry in entries]

    def info(self, path, **kwargs):
        """Return the info dict for a single path"""
        path = self._strip_protocol(path)
        [info] = self.fs.get_file_info([path])
        return self._make_entry(info)

    def exists(self, path):
        """Whether ``path`` exists, judged by ``info`` not raising"""
        path = self._strip_protocol(path)
        try:
            self.info(path)
        except FileNotFoundError:
            return False
        else:
            return True

    def _make_entry(self, info):
        """Convert a pyarrow file-info object into an fsspec-style dict

        Raises
        ------
        FileNotFoundError
            If the info object has type ``FileType.NotFound``
        """
        from pyarrow.fs import FileType

        if info.type is FileType.Directory:
            kind = "directory"
        elif info.type is FileType.File:
            kind = "file"
        elif info.type is FileType.NotFound:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
        else:
            kind = "other"

        return {
            "name": info.path,
            "size": info.size,
            "type": kind,
            "mtime": info.mtime,
        }

    @wrap_exceptions
    def cp_file(self, path1, path2, **kwargs):
        """Copy ``path1`` to ``path2`` via a temporary sibling of ``path2``

        Data is streamed into ``<path2>.tmp.<random hex>`` and that file is
        then moved onto ``path2``; on any failure the temporary file is
        deleted and the error re-raised.
        """
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")

        with self._open(path1, "rb") as lstream:
            tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.fs.move(tmp_fname, path2)
            except BaseException:  # noqa
                # clean up on every kind of interruption, then propagate
                with suppress(FileNotFoundError):
                    self.fs.delete_file(tmp_fname)
                raise

    @wrap_exceptions
    def mv(self, path1, path2, **kwargs):
        """Move ``path1`` to ``path2`` using the wrapped filesystem"""
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        self.fs.move(path1, path2)

    mv_file = mv

    @wrap_exceptions
    def rm_file(self, path):
        """Delete a single file"""
        path = self._strip_protocol(path)
        self.fs.delete_file(path)

    @wrap_exceptions
    def rm(self, path, recursive=False, maxdepth=None):
        """Delete a file, or a directory when ``recursive`` is true

        ``maxdepth`` is accepted but not used.

        Raises
        ------
        ValueError
            If ``path`` is a directory and ``recursive`` is false
        """
        path = self._strip_protocol(path).rstrip("/")
        if self.isdir(path):
            if recursive:
                self.fs.delete_dir(path)
            else:
                raise ValueError("Can't delete directories without recursive=True")
        else:
            self.fs.delete_file(path)

    @wrap_exceptions
    def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
        """Open ``path`` on the wrapped filesystem and wrap it in ArrowFile

        Parameters
        ----------
        path : str
        mode : str
            One of "rb", "wb" or "ab"
        block_size : int or None
            Stored on the returned file object
        seekable : bool
            For "rb" only: open as an input file if true, else as an input
            stream

        Raises
        ------
        ValueError
            For any other ``mode``
        """
        if mode == "rb":
            if seekable:
                method = self.fs.open_input_file
            else:
                method = self.fs.open_input_stream
        elif mode == "wb":
            method = self.fs.open_output_stream
        elif mode == "ab":
            method = self.fs.open_append_stream
        else:
            raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")

        _kwargs = {}
        if mode != "rb" or not seekable:
            if int(PYARROW_VERSION.split(".")[0]) >= 4:
                # disable compression auto-detection
                _kwargs["compression"] = None
        stream = method(path, **_kwargs)

        return ArrowFile(self, stream, path, mode, block_size, **kwargs)

    @wrap_exceptions
    def mkdir(self, path, create_parents=True, **kwargs):
        """Create a directory, with its parents if ``create_parents``"""
        path = self._strip_protocol(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            self.fs.create_dir(path, recursive=False)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        """Create a directory and any missing parents

        ``exist_ok`` is accepted but not consulted here.
        """
        path = self._strip_protocol(path)
        self.fs.create_dir(path, recursive=True)

    @wrap_exceptions
    def rmdir(self, path):
        """Delete a directory using the wrapped filesystem"""
        path = self._strip_protocol(path)
        self.fs.delete_dir(path)

    @wrap_exceptions
    def modified(self, path):
        """Return the ``mtime`` reported by the wrapped filesystem"""
        path = self._strip_protocol(path)
        return self.fs.get_file_info(path).mtime

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes of ``path``, optionally limited to ``start:end``

        The file is opened seekable only when a non-zero ``start`` makes a
        seek necessary.
        """
        kwargs["seekable"] = start not in [None, 0]
        # Forward the requested range (it used to be dropped, so the whole
        # file was always returned). A start of 0 is passed on as None so the
        # base implementation is not asked to seek on a non-seekable stream.
        # NOTE(review): negative offsets depend on how the base class uses
        # the file's size -- confirm before relying on them.
        return super().cat_file(path, start=start or None, end=end, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        """Download ``rpath`` to ``lpath``, reading it as a non-seekable stream"""
        kwargs["seekable"] = False
        super().get_file(rpath, lpath, **kwargs)
212
+
213
+
214
@mirror_from(
    "stream",
    [
        "read",
        "seek",
        "tell",
        "write",
        "readable",
        "writable",
        "close",
        "size",
        "seekable",
    ],
)
class ArrowFile(io.IOBase):
    """File-like object around a stream opened by ArrowFSWrapper

    The names listed in the ``mirror_from`` decorator are taken from the
    ``stream`` attribute.
    """

    def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
        self.fs = fs
        self.stream = stream

        self.path = path
        self.mode = mode

        # the same value is exposed under both spellings
        self.blocksize = block_size
        self.block_size = block_size
        self.kwargs = kwargs

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # closing is the only cleanup; the result of close() is returned
        return self.close()
244
+
245
+
246
class HadoopFileSystem(ArrowFSWrapper):
    """A wrapper on top of the pyarrow.fs.HadoopFileSystem
    to connect its interface with fsspec"""

    protocol = "hdfs"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        extra_conf=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        # imported lazily; inside this method the name refers to pyarrow's
        # class, not to this wrapper
        from pyarrow.fs import HadoopFileSystem

        connection = dict(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            extra_conf=extra_conf,
        )
        super().__init__(fs=HadoopFileSystem(**connection), **kwargs)

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract host, user and port constructor arguments from a URL

        Only components with a truthy value are included.
        """
        ops = infer_storage_options(path)
        # (key in the parsed URL, constructor argument name)
        wanted = (("host", "host"), ("username", "user"), ("port", "port"))
        return {
            argument: ops[key] for key, argument in wanted if ops.get(key, None)
        }
lib/python3.11/site-packages/fsspec/implementations/cache_mapper.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ import hashlib
5
+ from typing import TYPE_CHECKING
6
+
7
+ from fsspec.implementations.local import make_path_posix
8
+
9
+ if TYPE_CHECKING:
10
+ from typing import Any
11
+
12
+
13
class AbstractCacheMapper(abc.ABC):
    """Abstract super-class for mappers from remote URLs to local cached
    basenames.
    """

    @abc.abstractmethod
    def __call__(self, path: str) -> str:
        """Return the local cached basename for the remote ``path``"""

    def __eq__(self, other: Any) -> bool:
        # Equality is decided by class alone; derived classes that carry
        # attributes must extend this to compare them too.
        own_class = type(self)
        return isinstance(other, own_class)

    def __hash__(self) -> int:
        # Mirrors __eq__: the hash is that of the class; derived classes
        # with attributes must fold them in.
        own_class = type(self)
        return hash(own_class)
31
+
32
+
33
class BasenameCacheMapper(AbstractCacheMapper):
    """Cache mapper that uses the basename of the remote URL and a fixed number
    of directory levels above this.

    With the default of zero directory levels, different paths sharing a
    basename map to the same cached basename.
    """

    def __init__(self, directory_levels: int = 0):
        if directory_levels < 0:
            raise ValueError(
                "BasenameCacheMapper requires zero or positive directory_levels"
            )
        # Separator for directories when encoded as strings.
        self._separator = "_@_"
        self.directory_levels = directory_levels

    def __call__(self, path: str) -> str:
        # splitting from the right keeps the basename plus at most
        # ``directory_levels`` parent components after the first piece
        pieces = make_path_posix(path).rsplit("/", self.directory_levels + 1)
        if len(pieces) == 1:
            return pieces[0]  # No separator found, simple filename
        return self._separator.join(pieces[1:])

    def __eq__(self, other: Any) -> bool:
        if not super().__eq__(other):
            return False
        return self.directory_levels == other.directory_levels

    def __hash__(self) -> int:
        class_hash = super().__hash__()
        return class_hash ^ hash(self.directory_levels)
64
+
65
+
66
class HashCacheMapper(AbstractCacheMapper):
    """Cache mapper that uses a hash of the remote URL."""

    def __call__(self, path: str) -> str:
        # hex SHA-256 digest of the encoded path
        digest = hashlib.sha256(path.encode())
        return digest.hexdigest()
71
+
72
+
73
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
    """Factory method to create cache mapper for backward compatibility with
    ``CachingFileSystem`` constructor using ``same_names`` kwarg.

    Returns a ``BasenameCacheMapper`` when ``same_names`` is truthy, else a
    ``HashCacheMapper``.
    """
    mapper_class = BasenameCacheMapper if same_names else HashCacheMapper
    return mapper_class()
lib/python3.11/site-packages/fsspec/implementations/cache_metadata.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import pickle
5
+ import time
6
+ from typing import TYPE_CHECKING
7
+
8
+ from fsspec.utils import atomic_write
9
+
10
+ try:
11
+ import ujson as json
12
+ except ImportError:
13
+ if not TYPE_CHECKING:
14
+ import json
15
+
16
+ if TYPE_CHECKING:
17
+ from typing import Any, Dict, Iterator, Literal
18
+
19
+ from typing_extensions import TypeAlias
20
+
21
+ from .cached import CachingFileSystem
22
+
23
+ Detail: TypeAlias = Dict[str, Any]
24
+
25
+
26
+ class CacheMetadata:
27
+ """Cache metadata.
28
+
29
+ All reading and writing of cache metadata is performed by this class,
30
+ accessing the cached files and blocks is not.
31
+
32
+ Metadata is stored in a single file per storage directory in JSON format.
33
+ For backward compatibility, also reads metadata stored in pickle format
34
+ which is converted to JSON when next saved.
35
+ """
36
+
37
+ def __init__(self, storage: list[str]):
38
+ """
39
+
40
+ Parameters
41
+ ----------
42
+ storage: list[str]
43
+ Directories containing cached files, must be at least one. Metadata
44
+ is stored in the last of these directories by convention.
45
+ """
46
+ if not storage:
47
+ raise ValueError("CacheMetadata expects at least one storage location")
48
+
49
+ self._storage = storage
50
+ self.cached_files: list[Detail] = [{}]
51
+
52
+ # Private attribute to force saving of metadata in pickle format rather than
53
+ # JSON for use in tests to confirm can read both pickle and JSON formats.
54
+ self._force_save_pickle = False
55
+
56
+ def _load(self, fn: str) -> Detail:
57
+ """Low-level function to load metadata from specific file"""
58
+ try:
59
+ with open(fn, "r") as f:
60
+ return json.load(f)
61
+ except ValueError:
62
+ with open(fn, "rb") as f:
63
+ return pickle.load(f)
64
+
65
+ def _save(self, metadata_to_save: Detail, fn: str) -> None:
66
+ """Low-level function to save metadata to specific file"""
67
+ if self._force_save_pickle:
68
+ with atomic_write(fn) as f:
69
+ pickle.dump(metadata_to_save, f)
70
+ else:
71
+ with atomic_write(fn, mode="w") as f:
72
+ json.dump(metadata_to_save, f)
73
+
74
+ def _scan_locations(
75
+ self, writable_only: bool = False
76
+ ) -> Iterator[tuple[str, str, bool]]:
77
+ """Yield locations (filenames) where metadata is stored, and whether
78
+ writable or not.
79
+
80
+ Parameters
81
+ ----------
82
+ writable: bool
83
+ Set to True to only yield writable locations.
84
+
85
+ Returns
86
+ -------
87
+ Yields (str, str, bool)
88
+ """
89
+ n = len(self._storage)
90
+ for i, storage in enumerate(self._storage):
91
+ writable = i == n - 1
92
+ if writable_only and not writable:
93
+ continue
94
+ yield os.path.join(storage, "cache"), storage, writable
95
+
96
def check_file(
    self, path: str, cfs: CachingFileSystem | None
) -> Literal[False] | tuple[Detail, str]:
    """Look ``path`` up in the cache metadata.

    Returns a ``(detail, local_filename)`` tuple for the first storage
    location whose metadata contains ``path`` and whose cached file exists
    on disk, otherwise ``False``.  ``detail`` is a shallow copy of the stored
    entry.

    If the optional CachingFileSystem is specified then it is used to
    perform extra checks to reject possible matches: a ``uid`` mismatch when
    ``cfs.check_files`` is set, or an entry older than ``cfs.expiry``.
    """
    for location, cache in zip(self._scan_locations(), self.cached_files):
        if path not in cache:
            continue
        base = location[1]
        detail = cache[path].copy()

        if cfs is not None:
            # Wrong file as determined by hash of file properties
            if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
                continue
            # Cached file has expired
            if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
                continue

        local_fn = os.path.join(base, detail["fn"])
        if os.path.exists(local_fn):
            return detail, local_fn
    return False
122
+
123
def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
    """Drop expired entries from the metadata of the last cache location.

    Returns the local filenames belonging to the expired entries together
    with a flag that is True when no entries remain in that metadata.  The
    metadata is written back to disk only if entries remain.  Caller is
    responsible for deleting the expired files.

    Raises
    ------
    RuntimeError
        If an expired entry has no (or an empty) ``"fn"`` value.
    """
    writable = self.cached_files[-1]
    expired_files = []
    # Iterate over a copy because entries are popped from ``writable``.
    for path, detail in writable.copy().items():
        if not (time.time() - detail["time"] > expiry_time):
            continue
        fn = detail.get("fn", "")
        if not fn:
            raise RuntimeError(
                f"Cache metadata does not contain 'fn' for {path}"
            )
        expired_files.append(os.path.join(self._storage[-1], fn))
        writable.pop(path)

    if writable:
        self._save(writable, os.path.join(self._storage[-1], "cache"))

    return expired_files, not writable
148
+
149
def load(self) -> None:
    """Load all metadata from disk and store in ``self.cached_files``.

    One dict is produced per scanned location; a location whose metadata
    file does not exist contributes an empty dict.  ``"blocks"`` values that
    were stored as lists are converted to sets.
    """
    loaded = []
    for fn, _, _ in self._scan_locations():
        if not os.path.exists(fn):
            loaded.append({})
            continue
        # TODO: consolidate blocks here
        metadata = self._load(fn)
        for entry in metadata.values():
            if isinstance(entry["blocks"], list):
                entry["blocks"] = set(entry["blocks"])
        loaded.append(metadata)
    self.cached_files = loaded or [{}]
163
+
164
def on_close_cached_file(self, f: Any, path: str) -> None:
    """Perform side-effect actions on closing a cached file.

    If the number of recorded blocks times ``f.blocksize`` is at least
    ``f.size``, the entry's ``"blocks"`` is replaced by ``True``.

    The actual closing of the file is the responsibility of the caller.
    """
    # The entry is looked up in the last metadata dict, self.cached_files[-1].
    entry = self.cached_files[-1][path]
    blocks = entry["blocks"]
    if blocks is True:
        return
    if len(blocks) * f.blocksize >= f.size:
        entry["blocks"] = True
173
+
174
def pop_file(self, path: str) -> str | None:
    """Remove metadata of cached file.

    If path is in the cache, return the filename of the cached file,
    otherwise return ``None``. Caller is responsible for deleting the
    cached file.

    Raises
    ------
    PermissionError
        If the cached file's name does not start with the last storage
        location.
    """
    details = self.check_file(path, None)
    if not details:
        return None

    fn = details[1]
    if not fn.startswith(self._storage[-1]):
        raise PermissionError(
            "Can only delete cached file in last, writable cache location"
        )

    self.cached_files[-1].pop(path)
    self.save()
    return fn
193
+
194
def save(self) -> None:
    """Save metadata to disk.

    Only writable locations are processed.  If a metadata file already
    exists it is loaded and merged with the in-memory entries before being
    written back; otherwise the in-memory entries are written as they are.
    """
    for (fn, _, writable), in_memory in zip(
        self._scan_locations(), self.cached_files
    ):
        if not writable:
            continue

        if os.path.exists(fn):
            merged = self._load(fn)
            for key, on_disk in merged.items():
                if key not in in_memory:
                    continue
                ours = in_memory[key]
                if on_disk["blocks"] is True or ours["blocks"] is True:
                    on_disk["blocks"] = True
                else:
                    # self.cached_files[*][*]["blocks"] must continue to
                    # point to the same set object so that updates
                    # performed by MMapCache are propagated back to
                    # self.cached_files.
                    shared = ours["blocks"]
                    shared.update(on_disk["blocks"])
                    on_disk["blocks"] = shared
                on_disk["time"] = max(on_disk["time"], ours["time"])
                on_disk["uid"] = ours["uid"]

            # Files can be added to cache after it was written once
            for key, ours in in_memory.items():
                merged.setdefault(key, ours)
        else:
            merged = in_memory

        # Write shallow copies in which block sets are converted to lists,
        # leaving the in-memory entries untouched.
        serialisable = {key: entry.copy() for key, entry in merged.items()}
        for entry in serialisable.values():
            if isinstance(entry["blocks"], set):
                entry["blocks"] = list(entry["blocks"])
        self._save(serialisable, fn)
        self.cached_files[-1] = merged
229
+
230
def update_file(self, path: str, detail: Detail) -> None:
    """Set the in-memory metadata entry for ``path``; nothing is saved to disk.

    The entry is stored in the last metadata dict, ``self.cached_files[-1]``.
    """
    writable = self.cached_files[-1]
    writable[path] = detail
lib/python3.11/site-packages/fsspec/implementations/cached.py ADDED
@@ -0,0 +1,864 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import tempfile
7
+ import time
8
+ import weakref
9
+ from shutil import rmtree
10
+ from typing import TYPE_CHECKING, Any, Callable, ClassVar
11
+
12
+ from fsspec import AbstractFileSystem, filesystem
13
+ from fsspec.callbacks import _DEFAULT_CALLBACK
14
+ from fsspec.compression import compr
15
+ from fsspec.core import BaseCache, MMapCache
16
+ from fsspec.exceptions import BlocksizeMismatchError
17
+ from fsspec.implementations.cache_mapper import create_cache_mapper
18
+ from fsspec.implementations.cache_metadata import CacheMetadata
19
+ from fsspec.spec import AbstractBufferedFile
20
+ from fsspec.transaction import Transaction
21
+ from fsspec.utils import infer_compression
22
+
23
+ if TYPE_CHECKING:
24
+ from fsspec.implementations.cache_mapper import AbstractCacheMapper
25
+
26
+ logger = logging.getLogger("fsspec.cached")
27
+
28
+
29
class WriteCachedTransaction(Transaction):
    """Transaction that uploads the locally written files when committed."""

    def complete(self, commit=True):
        """Finish the transaction.

        When ``commit`` is true, every file in ``self.files`` is uploaded by
        passing its local name (``fn``) and remote ``path`` to
        ``self.fs.put``.  In all cases the filesystem's ``_intrans`` flag is
        reset to False.
        """
        remote_paths = [handle.path for handle in self.files]
        local_paths = [handle.fn for handle in self.files]
        if commit:
            self.fs.put(local_paths, remote_paths)
        # else remove?
        self.fs._intrans = False
37
+
38
+
39
class CachingFileSystem(AbstractFileSystem):
    """Locally caching filesystem, layer over any other FS

    Remote files are stored chunk-wise in a local directory so that repeated
    access is quick after the initial download.  Cached files are named by a
    hash of their URL.  If no directory is given, a temporary one is used,
    which should be cleaned up by the OS after the process ends.  The files
    themselves are sparse (as implemented in
    :class:`~fsspec.caching.MMapCache`), so only the data which is accessed
    takes up space.

    Restrictions:

    - the block-size must be the same for each access of a given file, unless
      all blocks of the file have already been read
    - caching can only be applied to file-systems which produce files
      derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
      allowed, for testing
    """

    protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")

    def __init__(
        self,
        target_protocol=None,
        cache_storage="TMP",
        cache_check=10,
        check_files=False,
        expiry_time=604800,
        target_options=None,
        fs=None,
        same_names: bool | None = None,
        compression=None,
        cache_mapper: AbstractCacheMapper | None = None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        target_protocol: str (optional)
            Target filesystem protocol. Provide either this or ``fs``.
        cache_storage: str or list(str)
            Location to store files. If "TMP", this is a temporary directory,
            and will be cleaned up by the OS when this process ends (or later).
            If a list, each location will be tried in the order given, but
            only the last will be considered writable.
        cache_check: int
            Number of seconds between reload of cache metadata
        check_files: bool
            Whether to explicitly see if the UID of the remote file matches
            the stored one before using. Warning: some file systems such as
            HTTP cannot reliably give a unique hash of the contents of some
            path, so be sure to set this option to False.
        expiry_time: int
            The time in seconds after which a local copy is considered useless.
            Set to falsy to prevent expiry. The default is equivalent to one
            week.
        target_options: dict or None
            Passed to the instantiation of the FS, if fs is None.
        fs: filesystem instance
            The target filesystem to run against. Provide this or ``protocol``.
        same_names: bool (optional)
            By default, target URLs are hashed using a ``HashCacheMapper`` so
            that files from different backends with the same basename do not
            conflict. If this argument is ``true``, a ``BasenameCacheMapper``
            is used instead. Other cache mapper options are available by using
            the ``cache_mapper`` keyword argument. Only one of this and
            ``cache_mapper`` should be specified.
        compression: str (optional)
            To decompress on download. Can be 'infer' (guess from the URL name),
            one of the entries in ``fsspec.compression.compr``, or None for no
            decompression.
        cache_mapper: AbstractCacheMapper (optional)
            The object use to map from original filenames to cached filenames.
            Only one of this and ``same_names`` should be specified.
        """
        super().__init__(**kwargs)

        # Exactly one of ``fs`` and ``target_protocol`` has to be supplied.
        if fs is None and target_protocol is None:
            raise ValueError(
                "Please provide filesystem instance(fs) or target_protocol"
            )
        if fs is not None and target_protocol is not None:
            raise ValueError(
                "Both filesystems (fs) and target_protocol may not be both given."
            )

        if cache_storage == "TMP":
            tempdir = tempfile.mkdtemp()
            storage = [tempdir]
            # Remove the temporary directory once this instance is finalized.
            weakref.finalize(self, self._remove_tempdir, tempdir)
        else:
            storage = (
                [cache_storage] if isinstance(cache_storage, str) else cache_storage
            )
            os.makedirs(storage[-1], exist_ok=True)

        self.storage = storage
        self.kwargs = target_options or {}
        self.cache_check = cache_check
        self.check_files = check_files
        self.expiry = expiry_time
        self.compression = compression

        # Size of cache in bytes. If None then the size is unknown and will be
        # recalculated the next time cache_size() is called. On writes to the
        # cache this is reset to None.
        self._cache_size = None

        if same_names is not None and cache_mapper is not None:
            raise ValueError(
                "Cannot specify both same_names and cache_mapper in "
                "CachingFileSystem.__init__"
            )
        if cache_mapper is None:
            self._mapper = create_cache_mapper(
                False if same_names is None else same_names
            )
        else:
            self._mapper = cache_mapper

        if isinstance(target_protocol, str):
            self.target_protocol = target_protocol
        elif isinstance(fs.protocol, str):
            self.target_protocol = fs.protocol
        else:
            self.target_protocol = fs.protocol[0]

        self._metadata = CacheMetadata(self.storage)
        self.load_cache()
        # ``fs`` is assigned last: until then __getattribute__ resolves
        # attributes on this class rather than on the target filesystem.
        self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)

        def _strip_protocol(path):
            # acts as a method, since each instance has a difference target
            return self.fs._strip_protocol(type(self)._strip_protocol(path))

        self._strip_protocol: Callable = _strip_protocol

    @staticmethod
    def _remove_tempdir(tempdir):
        """Delete ``tempdir`` recursively, ignoring any error."""
        try:
            rmtree(tempdir)
        except Exception:
            pass

    def _mkcache(self):
        """Make sure the last (writable) storage directory exists."""
        os.makedirs(self.storage[-1], exist_ok=True)

    def cache_size(self):
        """Return size of cache in bytes.

        If more than one cache directory is in use, only the size of the last
        one (the writable cache directory) is returned.  The value is kept in
        ``self._cache_size`` until that attribute is reset to None.
        """
        if self._cache_size is None:
            writable_dir = self.storage[-1]
            self._cache_size = filesystem("file").du(writable_dir, withdirs=True)
        return self._cache_size

    def load_cache(self):
        """Read the cache metadata from disk and note the time of reading."""
        self._metadata.load()
        self._mkcache()
        self.last_cache = time.time()

    def save_cache(self):
        """Write the cache metadata to disk and invalidate the cached size."""
        self._mkcache()
        self._metadata.save()
        self.last_cache = time.time()
        self._cache_size = None

    def _check_cache(self):
        """Reload caches if time elapsed or any disappeared"""
        self._mkcache()
        if not self.cache_check:
            # explicitly told not to bother checking
            return
        too_old = time.time() - self.last_cache > self.cache_check
        all_present = all(os.path.exists(storage) for storage in self.storage)
        if too_old or not all_present:
            self.load_cache()

    def _check_file(self, path):
        """Is path in cache and still valid"""
        path = self._strip_protocol(path)
        self._check_cache()
        return self._metadata.check_file(path, self)

    def clear_cache(self):
        """Remove all files and metadata from the cache

        In the case of multiple cache locations, this clears only the last one,
        which is assumed to be the read/write one.
        """
        rmtree(self.storage[-1])
        self.load_cache()
        self._cache_size = None

    def clear_expired_cache(self, expiry_time=None):
        """Remove all expired files and metadata from the cache

        In the case of multiple cache locations, this clears only the last one,
        which is assumed to be the read/write one.

        Parameters
        ----------
        expiry_time: int
            The time in seconds after which a local copy is considered useless.
            If not defined the default is equivalent to the attribute from the
            file caching instantiation.
        """
        if not expiry_time:
            expiry_time = self.expiry

        self._check_cache()

        expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
        for fn in expired_files:
            if os.path.exists(fn):
                os.remove(fn)

        if writable_cache_empty:
            # Nothing is left: remove the directory and start afresh.
            rmtree(self.storage[-1])
            self.load_cache()

        self._cache_size = None

    def pop_from_cache(self, path):
        """Remove cached version of given file

        Deletes local copy of the given (remote) path. If it is found in a cache
        location which is not the last, it is assumed to be read-only, and
        raises PermissionError
        """
        path = self._strip_protocol(path)
        fn = self._metadata.pop_file(path)
        if fn is not None:
            os.remove(fn)
        self._cache_size = None

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Wrap the target _open

        If the whole file exists in the cache, just open it locally and
        return that.

        Otherwise, open the file on the target FS, and make it have a mmap
        cache pointing to the location which we determine, in our cache.
        The ``blocks`` instance is shared, so as the mmap cache instance
        updates, so does the entry in our ``cached_files`` attribute.
        We monkey-patch this file, so that when it closes, we call
        ``close_and_update`` to save the state of the blocks.
        """
        path = self._strip_protocol(path)

        path = self.fs._strip_protocol(path)
        if "r" not in mode:
            # Non-read modes are passed straight to the target filesystem.
            return self.fs._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )

        cached = self._check_file(path)
        if cached:
            # file is in cache
            detail, fn = cached
            blocks = detail["blocks"]
            if blocks is True:
                # stored file is complete
                logger.debug("Opening local copy of %s", path)
                return open(fn, mode)
            # TODO: action where partial file exists in read-only cache
            logger.debug("Opening partially cached copy of %s", path)
        else:
            cache_name = self._mapper(path)
            fn = os.path.join(self.storage[-1], cache_name)
            blocks = set()
            detail = {
                "original": path,
                "fn": cache_name,
                "blocks": blocks,
                "time": time.time(),
                "uid": self.fs.ukey(path),
            }
            self._metadata.update_file(path, detail)
            logger.debug("Creating local sparse file for %s", path)

        # call target filesystems open
        self._mkcache()
        f = self.fs._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            cache_type="none",
            **kwargs,
        )
        if self.compression:
            if self.compression == "infer":
                comp = infer_compression(path)
            else:
                comp = self.compression
            f = compr[comp](f, mode="rb")

        if "blocksize" not in detail:
            detail["blocksize"] = f.blocksize
        elif detail["blocksize"] != f.blocksize:
            raise BlocksizeMismatchError(
                f"Cached file must be reopened with same block"
                f" size as original (old: {detail['blocksize']},"
                f" new {f.blocksize})"
            )

        f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
        # Keep the real ``close`` and route closing through close_and_update.
        close = f.close
        f.close = lambda: self.close_and_update(f, close)
        self.save_cache()
        return f

    def _parent(self, path):
        """Delegate to the target filesystem's ``_parent``."""
        return self.fs._parent(path)

    def hash_name(self, path: str, *args: Any) -> str:
        """Return the cache file name that the mapper gives for ``path``."""
        # Kept for backward compatibility with downstream libraries.
        # Ignores extra arguments, previously same_name boolean.
        return self._mapper(path)

    def close_and_update(self, f, close):
        """Called when a file is closing, so store the set of blocks"""
        if f.closed:
            return
        path = self._strip_protocol(f.path)
        self._metadata.on_close_cached_file(f, path)
        try:
            logger.debug("going to save")
            self.save_cache()
            logger.debug("saved")
        except OSError:
            logger.debug("Cache saving failed while closing file")
        except NameError:
            logger.debug("Cache save failed due to interpreter shutdown")
        close()
        f.closed = True

    def __getattribute__(self, item):
        """Resolve attributes on this class first, then on the target filesystem.

        Names in the set below are always taken from this class.  Anything
        else is looked up in the instance ``__dict__`` and then, once ``fs``
        has been set, on the target filesystem.
        """
        if item in {
            "load_cache",
            "_open",
            "save_cache",
            "close_and_update",
            "__init__",
            "__getattribute__",
            "__reduce__",
            "_make_local_details",
            "open",
            "cat",
            "cat_file",
            "cat_ranges",
            "get",
            "read_block",
            "tail",
            "head",
            "_check_file",
            "_check_cache",
            "_mkcache",
            "clear_cache",
            "clear_expired_cache",
            "pop_from_cache",
            "local_file",
            "_paths_from_path",
            "get_mapper",
            "open_many",
            "commit_many",
            "hash_name",
            "__hash__",
            "__eq__",
            "to_json",
            "cache_size",
            "pipe_file",
            "pipe",
            "start_transaction",
            "end_transaction",
        }:
            # all the methods defined in this class. Note `open` here, since
            # it calls `_open`, but is actually in superclass
            return lambda *args, **kw: getattr(type(self), item).__get__(self)(
                *args, **kw
            )
        if item == "__reduce_ex__":
            raise AttributeError
        if item == "transaction":
            # property
            return type(self).transaction.__get__(self)
        if item in ("_cache", "transaction_type"):
            # class attributes
            return getattr(type(self), item)
        if item == "__class__":
            return type(self)

        own = object.__getattribute__(self, "__dict__")
        fs = own.get("fs", None)  # fs is not immediately defined
        if item in own:
            return own[item]
        if fs is None:
            # attributes of the superclass, while target is being set up
            return super().__getattribute__(item)
        if item in fs.__dict__:
            # attribute of instance
            return fs.__dict__[item]
        # attributed belonging to the target filesystem
        cls = type(fs)
        m = getattr(cls, item)
        if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
            not hasattr(m, "__self__") or m.__self__ is None
        ):
            # instance method
            return m.__get__(fs, cls)
        return m  # class method or attribute

    def __eq__(self, other):
        """Test for equality."""
        if self is other:
            return True
        if not isinstance(other, type(self)):
            return False
        return (
            self.storage == other.storage
            and self.kwargs == other.kwargs
            and self.cache_check == other.cache_check
            and self.check_files == other.check_files
            and self.expiry == other.expiry
            and self.compression == other.compression
            and self._mapper == other._mapper
            and self.target_protocol == other.target_protocol
        )

    def __hash__(self):
        """Calculate hash."""
        combined = hash(tuple(self.storage))
        for part in (
            str(self.kwargs),
            self.cache_check,
            self.check_files,
            self.expiry,
            self.compression,
            self._mapper,
            self.target_protocol,
        ):
            combined ^= hash(part)
        return combined

    def to_json(self):
        """Calculate JSON representation.

        Not implemented yet for CachingFileSystem.
        """
        raise NotImplementedError(
            "CachingFileSystem JSON representation not implemented"
        )
508
+
509
+
510
class WholeFileCacheFileSystem(CachingFileSystem):
    """Caches whole remote files on first access

    This class is intended as a layer over any other file system, and
    will make a local copy of each file accessed, so that all subsequent
    reads are local. This is similar to ``CachingFileSystem``, but without
    the block-wise functionality and so can work even when sparse files
    are not allowed. See its docstring for definition of the init
    arguments.

    The class still needs access to the remote store for listing files,
    and may refresh cached files.
    """

    protocol = "filecache"
    local_file = True

    def open_many(self, open_files):
        """Open local files for every entry of ``open_files``.

        For modes without ``"r"`` a ``LocalTempFile`` is returned per path.
        For read modes, paths that are not cached are fetched with one
        ``self.fs.get`` call and recorded in the metadata, after which plain
        local file objects are returned.

        Raises
        ------
        NotImplementedError
            If ``self.compression`` is set and the mode is a read mode.
        """
        paths = [of.path for of in open_files]
        mode = open_files.mode
        if "r" not in mode:
            return [
                LocalTempFile(
                    self.fs,
                    path,
                    mode=mode,
                    fn=os.path.join(self.storage[-1], self._mapper(path)),
                )
                for path in paths
            ]
        self._mkcache()

        if self.compression:
            raise NotImplementedError
        details = [self._check_file(sp) for sp in paths]
        # keep these path names for opening later
        downfn0 = [os.path.join(self.storage[-1], self._mapper(p)) for p in paths]
        downpath = [p for p, d in zip(paths, details) if not d]
        downfn = [fn for fn, d in zip(downfn0, details) if not d]
        if downpath:
            # skip if all files are already cached and up to date
            self.fs.get(downpath, downfn)

            # update metadata - only happens when downloads are successful
            newdetail = [
                {
                    "original": path,
                    "fn": self._mapper(path),
                    "blocks": True,
                    "time": time.time(),
                    "uid": self.fs.ukey(path),
                }
                for path in downpath
            ]
            for path, detail in zip(downpath, newdetail):
                self._metadata.update_file(path, detail)
            self.save_cache()

        opened = []
        for cached, target in zip(details, downfn0):
            if cached:
                # whole-file caches give (detail, fn); simple caches give fn
                local = cached[1] if isinstance(cached, tuple) else cached
            else:
                local = target
            opened.append(open(local, mode=mode))
        return opened

    def commit_many(self, open_files):
        """Upload all ``open_files`` in one ``put``, close them and remove them."""
        self.fs.put([f.fn for f in open_files], [f.path for f in open_files])
        for f in open_files:
            f.close()
        for f in open_files:
            # in case autocommit is off, and so close did not already delete
            try:
                os.remove(f.name)
            except FileNotFoundError:
                pass
        self._cache_size = None

    def _make_local_details(self, path):
        """Record a whole-file metadata entry for ``path``.

        Returns the local filename, inside the last storage location, at
        which the file is expected to be stored.
        """
        cache_name = self._mapper(path)
        fn = os.path.join(self.storage[-1], cache_name)
        self._metadata.update_file(
            path,
            {
                "original": path,
                "fn": cache_name,
                "blocks": True,
                "time": time.time(),
                "uid": self.fs.ukey(path),
            },
        )
        logger.debug("Copying %s to local cache", path)
        return fn

    def cat(
        self,
        path,
        recursive=False,
        on_error="raise",
        callback=_DEFAULT_CALLBACK,
        **kwargs,
    ):
        """Return the contents of one or more files, going through the cache.

        Paths that are not cached are fetched with a single ``self.fs.get``
        call; all contents are then read from the local copies.  If an
        exception occurs while checking a path it is re-raised when
        ``on_error`` is "raise", stored as that path's value when it is
        "return", and in both non-raising cases the path is dropped from
        the set that is read.
        """
        paths = self.expand_path(
            path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
        )
        getpaths = []
        storepaths = []
        fns = []
        out = {}
        # iterate over a copy because failing paths are removed from ``paths``
        for p in paths.copy():
            try:
                cached = self._check_file(p)
                if cached:
                    _, fn = cached if isinstance(cached, tuple) else (None, cached)
                else:
                    fn = self._make_local_details(p)
                    getpaths.append(p)
                    storepaths.append(fn)
                fns.append(fn)
            except Exception as e:
                if on_error == "raise":
                    raise
                if on_error == "return":
                    out[p] = e
                paths.remove(p)

        if getpaths:
            self.fs.get(getpaths, storepaths)
            self.save_cache()

        callback.set_size(len(paths))
        for p, fn in zip(paths, fns):
            with open(fn, "rb") as f:
                out[p] = f.read()
            callback.relative_update(1)
        if isinstance(path, str) and len(paths) == 1 and recursive is False:
            out = out[paths[0]]
        return out

    def _open(self, path, mode="rb", **kwargs):
        """Open ``path`` from the local cache, downloading it first if needed.

        Modes without ``"r"`` yield a ``LocalTempFile``.  If the file is
        missing from the cache it is copied (and decompressed when
        ``self.compression`` is set) into the last storage location, the
        metadata is saved and the method calls itself again to open the
        local copy.

        Raises
        ------
        ValueError
            If the cache entry for ``path`` is only partially cached.
        """
        path = self._strip_protocol(path)
        if "r" not in mode:
            fn = self._make_local_details(path)
            return LocalTempFile(self, path, mode=mode, fn=fn)

        cached = self._check_file(path)
        if cached:
            detail, fn = cached
            if detail["blocks"] is not True:
                raise ValueError(
                    f"Attempt to open partially cached file {path}"
                    f" as a wholly cached file"
                )
            logger.debug("Opening local copy of %s", path)

            # In order to support downstream filesystems to be able to
            # infer the compression from the original filename, like
            # the `TarFileSystem`, let's extend the `io.BufferedReader`
            # fileobject protocol by adding a dedicated attribute
            # `original`.
            f = open(fn, mode)
            f.original = detail.get("original")
            return f

        fn = self._make_local_details(path)
        kwargs["mode"] = mode

        # call target filesystems open
        self._mkcache()
        if self.compression:
            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
                if isinstance(f, AbstractBufferedFile):
                    # want no type of caching if just downloading whole thing
                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
                if self.compression == "infer":
                    comp = infer_compression(path)
                else:
                    comp = self.compression
                f = compr[comp](f, mode="rb")
                # copy block by block until a read returns nothing
                data = True
                while data:
                    block = getattr(f, "blocksize", 5 * 2**20)
                    data = f.read(block)
                    f2.write(data)
        else:
            self.fs.get_file(path, fn)
        self.save_cache()
        return self._open(path, mode)
700
+
701
+
702
class SimpleCacheFileSystem(WholeFileCacheFileSystem):
    """Caches whole remote files on first access

    This class is intended as a layer over any other file system, and
    will make a local copy of each file accessed, so that all subsequent
    reads are local. This implementation only copies whole files, and
    does not keep any metadata about the download time or file details.
    It is therefore safer to use in multi-threaded/concurrent situations.

    This is the only of the caching filesystems that supports write: you will
    be given a real local open file, and upon close and commit, it will be
    uploaded to the target filesystem; the writability or the target URL is
    not checked until that time.

    """

    protocol = "simplecache"
    local_file = True
    transaction_type = WriteCachedTransaction

    def __init__(self, **kwargs):
        """Create the filesystem; see ``CachingFileSystem`` for the arguments.

        ``cache_check``, ``expiry_time`` and ``check_files`` are always
        forced to False, and every storage directory is created if missing.
        """
        kw = kwargs.copy()
        kw.update(cache_check=False, expiry_time=False, check_files=False)
        super().__init__(**kw)
        for storage in self.storage:
            if not os.path.exists(storage):
                os.makedirs(storage, exist_ok=True)

    def _check_file(self, path):
        """Return the local filename holding ``path``, or None if not cached.

        Every storage location is searched in order for a file named after
        the mapper's output for ``path``.
        """
        self._check_cache()
        sha = self._mapper(path)
        for storage in self.storage:
            fn = os.path.join(storage, sha)
            if os.path.exists(fn):
                return fn

    def save_cache(self):
        """Do nothing: this cache keeps no metadata."""
        pass

    def load_cache(self):
        """Do nothing: this cache keeps no metadata."""
        pass

    def pipe_file(self, path, value=None, **kwargs):
        """Write ``value`` to ``path``.

        Inside a transaction the data goes through ``self.open`` so that it
        is written to a local file first; otherwise the parent
        implementation is used.
        """
        if self._intrans:
            with self.open(path, "wb") as f:
                f.write(value)
        else:
            super().pipe_file(path, value)

    def pipe(self, path, value=None, **kwargs):
        """Write one value to a path, or each value of a ``{path: value}`` dict.

        Raises
        ------
        ValueError
            If ``path`` is neither a str nor a dict.
        """
        if isinstance(path, str):
            self.pipe_file(self._strip_protocol(path), value, **kwargs)
        elif isinstance(path, dict):
            for k, v in path.items():
                self.pipe_file(self._strip_protocol(k), v, **kwargs)
        else:
            raise ValueError("path must be str or dict")

    def cat_ranges(
        self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
    ):
        """Read byte ranges, first fetching uncached files in one bulk ``get``.

        The previous implementation selected files to prefetch with
        ``l is False``, but ``_check_file`` returns a filename or ``None``,
        so nothing was ever prefetched.  Missing files are now detected with
        a truthiness test and downloaded to the same local names that
        ``_open`` uses.

        The prefetch is best-effort: if it fails, any files it may have
        created are removed and the per-file path in the parent
        ``cat_ranges`` handles (and reports, according to ``on_error``) the
        problem.  It is skipped when ``compression`` is set, because
        ``_open`` stores decompressed data whereas ``get`` would store the
        raw bytes.
        """
        # NOTE: kept inline rather than in a helper method, because
        # CachingFileSystem.__getattribute__ only resolves a fixed list of
        # names on this class and would not find a new helper.
        if not self.compression:
            targets = {}
            for p in paths:
                stripped = self._strip_protocol(p)
                if stripped not in targets and not self._check_file(stripped):
                    targets[stripped] = os.path.join(
                        self.storage[-1], self._mapper(stripped)
                    )
            if targets:
                self._mkcache()
                self._cache_size = None
                try:
                    self.fs.get(list(targets), list(targets.values()))
                except Exception:
                    logger.debug("Bulk prefetch failed; falling back to per-file")
                    # None of these files existed beforehand, so removing
                    # them cannot discard valid cache content; it prevents a
                    # partial download from being mistaken for a full copy.
                    for fn in targets.values():
                        try:
                            os.remove(fn)
                        except OSError:
                            pass
        return super().cat_ranges(
            paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
        )

    def _open(self, path, mode="rb", **kwargs):
        """Open ``path`` via the local cache.

        Modes without ``"r"`` yield a ``LocalTempFile`` whose autocommit is
        disabled while a transaction is active.  For reads, an uncached file
        is first copied (and decompressed when ``self.compression`` is set)
        into the last storage location, then the method calls itself again
        to open the local copy.
        """
        path = self._strip_protocol(path)
        sha = self._mapper(path)

        if "r" not in mode:
            fn = os.path.join(self.storage[-1], sha)
            return LocalTempFile(
                self, path, mode=mode, autocommit=not self._intrans, fn=fn
            )
        fn = self._check_file(path)
        if fn:
            return open(fn, mode)

        fn = os.path.join(self.storage[-1], sha)
        logger.debug("Copying %s to local cache", path)
        kwargs["mode"] = mode

        self._mkcache()
        self._cache_size = None
        if self.compression:
            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
                if isinstance(f, AbstractBufferedFile):
                    # want no type of caching if just downloading whole thing
                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
                comp = (
                    infer_compression(path)
                    if self.compression == "infer"
                    else self.compression
                )
                f = compr[comp](f, mode="rb")
                # copy block by block until a read returns nothing
                data = True
                while data:
                    block = getattr(f, "blocksize", 5 * 2**20)
                    data = f.read(block)
                    f2.write(data)
        else:
            self.fs.get_file(path, fn)
        return self._open(path, mode)
810
+
811
+
812
class LocalTempFile:
    """A temporary local file, which will be uploaded on commit

    Attributes that are not defined here are forwarded to the underlying
    local file object ``fh``.
    """

    def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0):
        """Open the local file ``fn`` which stands in for remote ``path``.

        Parameters
        ----------
        fs:
            Object whose ``put(local, remote)`` method is used on commit.
        path:
            Remote path the file is uploaded to.
        fn:
            Local filename to open.
        mode: str
            Mode passed to ``open``.
        autocommit: bool
            Whether ``close`` also commits (uploads) the file.
        seek: int
            If non-zero, position to seek to after opening.
        """
        self.fn = fn
        self.fh = open(fn, mode)
        self.mode = mode
        if seek:
            self.fh.seek(seek)
        self.path = path
        self.fs = fs
        self.closed = False
        self.autocommit = autocommit

    def __reduce__(self):
        # always open in r+b to allow continuing writing at a location
        return (
            LocalTempFile,
            (self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
        )

    def __enter__(self):
        # The raw file handle is what the ``with`` block works with.
        return self.fh

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close the local file once; commit it if ``autocommit`` is set."""
        if self.closed:
            return
        self.fh.close()
        self.closed = True
        if self.autocommit:
            self.commit()

    def discard(self):
        """Close and delete the local file without uploading it."""
        self.fh.close()
        # Mark as closed so that a later close() (for instance from
        # __exit__) does not try to upload the file that is removed here.
        self.closed = True
        os.remove(self.fn)

    def commit(self):
        """Upload the local file to ``path`` and remove the local copy."""
        self.fs.put(self.fn, self.path)
        try:
            os.remove(self.fn)
        except (PermissionError, FileNotFoundError):
            # file path may be held by new version of the file on windows
            pass

    @property
    def name(self):
        return self.fn

    def __getattr__(self, item):
        # ``fh`` itself is looked up here only when it has not been set yet;
        # forwarding that lookup would recurse without end.
        if item == "fh":
            raise AttributeError(item)
        return getattr(self.fh, item)
lib/python3.11/site-packages/fsspec/implementations/dask.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dask
2
+ from distributed.client import Client, _get_global_client
3
+ from distributed.worker import Worker
4
+
5
+ from fsspec import filesystem
6
+ from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
7
+ from fsspec.utils import infer_storage_options
8
+
9
+
10
def _get_client(client):
    """Turn ``client`` into a ``Client`` object.

    A ``Client`` instance is returned unchanged, ``None`` yields the result
    of ``_get_global_client()``, and anything else (e.g., a connection
    string) is passed to the ``Client`` constructor.
    """
    if isinstance(client, Client):
        return client
    if client is None:
        return _get_global_client()
    # e.g., connection string
    return Client(client)
18
+
19
+
20
def _in_worker():
    """Return True when ``Worker._instances`` is non-empty (truthy)."""
    return True if Worker._instances else False
22
+
23
+
24
class DaskWorkerFileSystem(AbstractFileSystem):
    """View files accessible to a worker as any other remote file-system

    When instances are run on the worker, uses the real filesystem. When
    run on the client, they call the worker to provide information or data.

    **Warning** this implementation is experimental, and read-only for now.
    """

    def __init__(
        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
    ):
        super().__init__(**kwargs)
        # exactly one of ``fs`` / ``target_protocol`` must be supplied
        if (fs is None) == (target_protocol is None):
            raise ValueError(
                "Please provide one of filesystem instance (fs) or"
                " target_protocol, not both"
            )
        self.target_protocol = target_protocol
        self.target_options = target_options
        self.worker = None
        self.client = client
        self.fs = fs
        self._determine_worker()

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract a ``client`` address ("host:port") from the URL, if present."""
        options = infer_storage_options(path)
        if "host" not in options or "port" not in options:
            return {}
        return {"client": f"{options['host']}:{options['port']}"}

    def _determine_worker(self):
        """Set ``self.worker`` and prepare either the local fs or the remote proxy."""
        if not _in_worker():
            # client side: calls are routed through a delayed copy of self
            self.worker = False
            self.client = _get_client(self.client)
            self.rfs = dask.delayed(self)
            return

        self.worker = True
        if self.fs is None:
            self.fs = filesystem(
                self.target_protocol, **(self.target_options or {})
            )

    def mkdir(self, *args, **kwargs):
        if not self.worker:
            self.rfs.mkdir(*args, **kwargs).compute()
            return
        self.fs.mkdir(*args, **kwargs)

    def rm(self, *args, **kwargs):
        if not self.worker:
            self.rfs.rm(*args, **kwargs).compute()
            return
        self.fs.rm(*args, **kwargs)

    def copy(self, *args, **kwargs):
        if not self.worker:
            self.rfs.copy(*args, **kwargs).compute()
            return
        self.fs.copy(*args, **kwargs)

    def mv(self, *args, **kwargs):
        if not self.worker:
            self.rfs.mv(*args, **kwargs).compute()
            return
        self.fs.mv(*args, **kwargs)

    def ls(self, *args, **kwargs):
        if not self.worker:
            return self.rfs.ls(*args, **kwargs).compute()
        return self.fs.ls(*args, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Open via the wrapped filesystem on a worker, else return a DaskFile."""
        if not self.worker:
            return DaskFile(
                fs=self,
                path=path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )
        return self.fs._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )

    def fetch_range(self, path, mode, start, end):
        """Return the bytes of ``path`` in the half-open range [start, end)."""
        if not self.worker:
            return self.rfs.fetch_range(path, mode, start, end).compute()
        with self._open(path, mode) as handle:
            handle.seek(start)
            return handle.read(end - start)
135
+
136
+
137
class DaskFile(AbstractBufferedFile):
    """Buffered file whose byte ranges are fetched through its filesystem.

    Only ``"rb"`` mode is accepted; the upload hooks are no-ops.
    """

    def __init__(self, mode="rb", **kwargs):
        if mode == "rb":
            # ``mode`` is not forwarded to the base-class initialiser
            super().__init__(**kwargs)
        else:
            raise ValueError('Remote dask files can only be opened in "rb" mode')

    def _upload_chunk(self, final=False):
        """No-op: writing is not supported."""
        return None

    def _initiate_upload(self):
        """Create remote file/upload"""
        return None

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        filesystem_ = self.fs
        return filesystem_.fetch_range(self.path, self.mode, start, end)
lib/python3.11/site-packages/fsspec/implementations/data.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ from urllib.parse import unquote
4
+
5
+ from fsspec import AbstractFileSystem
6
+
7
+
8
class DataFileSystem(AbstractFileSystem):
    """A handy decoder for data-URLs

    Example
    -------
    >>> with fsspec.open("data:,Hello%2C%20World%21") as f:
    ...     print(f.read())
    b'Hello, World!'

    """

    protocol = "data"

    def __init__(self, **kwargs):
        """No parameters for this filesystem"""
        super().__init__(**kwargs)

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Decode the payload of the data-URL and return bytes[start:end].

        The text before the first comma is the header; when it ends with
        "base64" the payload is base64-decoded, otherwise it is
        percent-decoded and encoded to bytes.
        """
        header, payload = path.split(",", 1)
        if header.endswith("base64"):
            decoded = base64.b64decode(payload)
        else:
            decoded = unquote(payload).encode()
        return decoded[start:end]

    def info(self, path, **kwargs):
        """Describe the data-URL: name (raw payload), size, type and mimetype."""
        header, payload = path.split(",", 1)
        content = self.cat_file(path)
        # the mimetype sits between the first ":" and the first ";" of the header
        after_scheme = header.split(":", 1)[1]
        mimetype = after_scheme.split(";", 1)[0]
        return {
            "name": payload,
            "size": len(content),
            "type": "file",
            "mimetype": mimetype,
        }

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return an in-memory binary stream with the decoded payload."""
        if "r" in mode:
            return io.BytesIO(self.cat_file(path))
        raise ValueError("Read only filesystem")
lib/python3.11/site-packages/fsspec/implementations/dbfs.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import urllib
3
+
4
+ import requests
5
+
6
+ from fsspec import AbstractFileSystem
7
+ from fsspec.spec import AbstractBufferedFile
8
+
9
+
10
class DatabricksException(Exception):
    """
    Error type used by this module; carries the ``error_code`` and
    ``message`` it was created with.
    """

    def __init__(self, error_code, message):
        """Store ``error_code`` and ``message``; ``message`` is the exception text."""
        self.message = message
        self.error_code = error_code

        super().__init__(message)
21
+
22
+
23
class DatabricksFileSystem(AbstractFileSystem):
    """
    Get access to the Databricks filesystem implementation over HTTP.
    Can be used inside and outside of a databricks cluster.
    """

    def __init__(self, instance, token, **kwargs):
        """
        Create a new DatabricksFileSystem.

        Parameters
        ----------
        instance: str
            The instance URL of the databricks cluster.
            For example for an Azure databricks cluster, this
            has the form adb-<some-number>.<two digits>.azuredatabricks.net.
        token: str
            Your personal token. Find out more
            here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
        """
        self.instance = instance
        self.token = token

        self.session = requests.Session()
        self.session.headers.update({"Authorization": f"Bearer {self.token}"})

        super().__init__(**kwargs)

    def ls(self, path, detail=True):
        """
        List the contents of the given path.

        Parameters
        ----------
        path: str
            Absolute path
        detail: bool
            Return not only the list of filenames,
            but also additional information on file sizes
            and types.

        Raises
        ------
        FileNotFoundError
            If the API reports RESOURCE_DOES_NOT_EXIST.
        """
        out = self._ls_from_cache(path)
        if not out:
            try:
                r = self._send_to_api(
                    method="get", endpoint="list", json={"path": path}
                )
            except DatabricksException as e:
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    raise FileNotFoundError(e.message) from e

                raise
            # A response without a "files" entry is treated as an empty
            # listing instead of failing with KeyError.
            files = r.get("files", [])
            out = [
                {
                    "name": o["path"],
                    "type": "directory" if o["is_dir"] else "file",
                    "size": o["file_size"],
                }
                for o in files
            ]
            self.dircache[path] = out

        if detail:
            return out
        return [o["name"] for o in out]

    def makedirs(self, path, exist_ok=True):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        exist_ok: bool
            If false, checks if the folder
            exists before creating it (and raises an
            Exception if this is the case)

        Raises
        ------
        FileExistsError
            If ``exist_ok`` is false and the path is already present, or if
            the API reports RESOURCE_ALREADY_EXISTS.
        """
        if not exist_ok:
            try:
                # If the following succeeds, the path is already present
                self._send_to_api(
                    method="get", endpoint="get-status", json={"path": path}
                )
            except DatabricksException as e:
                # Only "does not exist" means we may go on; any other error
                # leaves the existence check unanswered, so propagate it.
                if e.error_code != "RESOURCE_DOES_NOT_EXIST":
                    raise
            else:
                raise FileExistsError(f"Path {path} already exists")

        try:
            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(path))

    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        create_parents: bool
            Whether to create all parents or not.
            "False" is not implemented so far.
        """
        if not create_parents:
            raise NotImplementedError

        # NOTE(review): ``mkdirs`` is not defined in this class; presumably
        # it is inherited from AbstractFileSystem - confirm.
        self.mkdirs(path, **kwargs)

    def rm(self, path, recursive=False):
        """
        Remove the file or folder at the given absolute path.

        Parameters
        ----------
        path: str
            Absolute path what to remove
        recursive: bool
            Recursively delete all files in a folder.

        Raises
        ------
        OSError
            If the API reports IO_ERROR.
        """
        while True:
            try:
                self._send_to_api(
                    method="post",
                    endpoint="delete",
                    json={"path": path, "recursive": recursive},
                )
            except DatabricksException as e:
                # This is not really an exception, it just means
                # not everything was deleted so far: issue the request again.
                if e.error_code == "PARTIAL_DELETE":
                    continue
                if e.error_code == "IO_ERROR":
                    # Using the same exception as the os module would use here
                    raise OSError(e.message) from e

                raise
            break
        self.invalidate_cache(self._parent(path))

    def mv(self, source_path, destination_path, recursive=False, maxdepth=None):
        """
        Move a source to a destination path.

        A note from the original [databricks API manual]
        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).

        When moving a large number of files the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented so far.
        maxdepth:
            Not implemented so far.

        Raises
        ------
        FileNotFoundError
            If the API reports RESOURCE_DOES_NOT_EXIST.
        FileExistsError
            If the API reports RESOURCE_ALREADY_EXISTS.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))

    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Overwrite the base class method to make sure to create a DBFile.
        All arguments are copied from the base method.

        Only the default blocksize is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)

    def _send_to_api(self, method, endpoint, json):
        """
        Send the given json to the DBFS API
        using a get or post request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which http method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send

        Returns
        -------
        The decoded JSON body of the response.

        Raises
        ------
        ValueError
            If ``method`` is neither "get" nor "post".
        DatabricksException
            If the request failed and the response carries a JSON body with
            "error_code" and "message".
        requests.HTTPError
            If the request failed and no such JSON body could be extracted.
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as good as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract json error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
                # Pick the two expected fields explicitly: unpacking the
                # whole payload as keyword arguments would raise TypeError
                # whenever it has extra or missing keys.
                error_code = exception_json["error_code"]
                message = exception_json["message"]
            except Exception:
                raise e from None

            raise DatabricksException(error_code, message) from e

        return r.json()

    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        whenever written during this transaction.
        The handle is active for 10 minutes - after that a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exist at this location, either overwrite
            it or raise an exception.
        """
        try:
            r = self._send_to_api(
                method="post",
                endpoint="create",
                json={"path": path, "overwrite": overwrite},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        return r["handle"]

    def _close_handle(self, handle):
        """
        Close a handle, which was opened by :func:`_create_handle`.

        Parameters
        ----------
        handle: str
            Which handle to close.
        """
        try:
            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e

            raise

    def _add_data(self, handle, data):
        """
        Upload data to an already opened file handle
        (opened by :func:`_create_handle`).
        The maximal allowed data size is 1MB after
        conversion to base64.
        Remember to close the handle when you are finished.

        Parameters
        ----------
        handle: str
            Which handle to upload data to.
        data: bytes
            Block of data to add to the handle.
        """
        data = base64.b64encode(data).decode()
        try:
            self._send_to_api(
                method="post",
                endpoint="add-block",
                json={"handle": handle, "data": data},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
                raise ValueError(e.message) from e

            raise

    def _get_data(self, path, start, end):
        """
        Download data in bytes from a given absolute path in a block
        from [start, start+length].
        The maximum number of allowed bytes to read is 1MB.

        Parameters
        ----------
        path: str
            Absolute path to download data from
        start: int
            Start position of the block
        end: int
            End position of the block
        """
        try:
            r = self._send_to_api(
                method="get",
                endpoint="read",
                json={"path": path, "offset": start, "length": end - start},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
                raise ValueError(e.message) from e

            raise
        return base64.b64decode(r["data"])

    def invalidate_cache(self, path=None):
        """Drop the cached listing of ``path``, or all of them if ``path`` is None."""
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
377
+
378
+
379
class DatabricksFile(AbstractBufferedFile):
    """
    File object returned by ``DatabricksFileSystem._open``; transfers data
    through the filesystem's handle/data helper methods.
    """

    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """
        Create a new instance of the DatabricksFile.

        ``block_size`` may be None, "default" or the value of
        ``DEFAULT_BLOCK_SIZE``; anything else fails the assertion below.
        """
        wants_default = block_size is None or block_size == "default"
        effective_size = self.DEFAULT_BLOCK_SIZE if wants_default else block_size

        assert (
            effective_size == self.DEFAULT_BLOCK_SIZE
        ), f"Only the default block size is allowed, not {effective_size}"

        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=effective_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options or {},
            **kwargs,
        )

    def _initiate_upload(self):
        """Internal function to start a file upload"""
        new_handle = self.fs._create_handle(self.path)
        self.handle = new_handle

    def _upload_chunk(self, final=False):
        """Internal function to add a chunk of data to a started upload"""
        self.buffer.seek(0)
        payload = self.buffer.getvalue()

        # send the buffer contents in slices of at most ``blocksize`` bytes
        for lo, hi in self._to_sized_blocks(len(payload)):
            self.fs._add_data(handle=self.handle, data=payload[lo:hi])

        if final:
            self.fs._close_handle(handle=self.handle)
            return True

    def _fetch_range(self, start, end):
        """Internal function to download a block of data"""
        span = end - start
        return b"".join(
            self.fs._get_data(path=self.path, start=lo, end=hi)
            for lo, hi in self._to_sized_blocks(span, start)
        )

    def _to_sized_blocks(self, length, start=0):
        """Yield (begin, end) pairs covering [start, start + length) in blocksize steps"""
        stop = start + length
        for begin in range(start, stop, self.blocksize):
            yield begin, min(stop, begin + self.blocksize)
lib/python3.11/site-packages/fsspec/implementations/dirfs.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .. import filesystem
2
+ from ..asyn import AsyncFileSystem
3
+
4
+
5
class DirFileSystem(AsyncFileSystem):
    """Directory prefix filesystem

    The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
    is relative to the `path`. After performing the necessary paths operation it
    delegates everything to the wrapped filesystem.
    """

    protocol = "dir"

    def __init__(
        self,
        path=None,
        fs=None,
        fo=None,
        target_protocol=None,
        target_options=None,
        **storage_options,
    ):
        """
        Parameters
        ----------
        path: str
            Path to the directory.
        fs: AbstractFileSystem
            An instantiated filesystem to wrap.
        target_protocol, target_options:
            if fs is none, construct it from these
        fo: str
            Alternate for path; do not provide both
        """
        super().__init__(**storage_options)
        if fs is None:
            fs = filesystem(protocol=target_protocol, **(target_options or {}))
        # exactly one of ``path`` / ``fo`` has to be given
        if (path is None) == (fo is None):
            raise ValueError("Provide path or fo, not both")
        path = path or fo

        if self.asynchronous and not fs.async_impl:
            raise ValueError("can't use asynchronous with non-async fs")

        if fs.async_impl and self.asynchronous != fs.asynchronous:
            raise ValueError("both dirfs and fs should be in the same sync/async mode")

        self.path = fs._strip_protocol(path)
        self.fs = fs

    def _join(self, path):
        """Prefix ``path`` (a string or an iterable of strings) with ``self.path``."""
        if not isinstance(path, str):
            return [self._join(single) for single in path]
        if not self.path:
            return path
        if not path:
            return self.path
        return self.fs.sep.join((self.path, self._strip_protocol(path)))

    def _relpath(self, path):
        """Remove the ``self.path`` prefix from ``path`` (string or iterable)."""
        if not isinstance(path, str):
            return [self._relpath(single) for single in path]
        if not self.path:
            return path
        if path == self.path:
            return ""
        prefix = self.path + self.fs.sep
        assert path.startswith(prefix)
        return path[len(prefix) :]

    # Wrappers below

    @property
    def sep(self):
        return self.fs.sep

    async def set_session(self, *args, **kwargs):
        return await self.fs.set_session(*args, **kwargs)

    async def _rm_file(self, path, **kwargs):
        target = self._join(path)
        return await self.fs._rm_file(target, **kwargs)

    def rm_file(self, path, **kwargs):
        target = self._join(path)
        return self.fs.rm_file(target, **kwargs)

    async def _rm(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._rm(target, *args, **kwargs)

    def rm(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.rm(target, *args, **kwargs)

    async def _cp_file(self, path1, path2, **kwargs):
        source = self._join(path1)
        destination = self._join(path2)
        return await self.fs._cp_file(source, destination, **kwargs)

    def cp_file(self, path1, path2, **kwargs):
        source = self._join(path1)
        destination = self._join(path2)
        return self.fs.cp_file(source, destination, **kwargs)

    async def _copy(
        self,
        path1,
        path2,
        *args,
        **kwargs,
    ):
        source = self._join(path1)
        destination = self._join(path2)
        return await self.fs._copy(source, destination, *args, **kwargs)

    def copy(self, path1, path2, *args, **kwargs):
        source = self._join(path1)
        destination = self._join(path2)
        return self.fs.copy(source, destination, *args, **kwargs)

    async def _pipe(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._pipe(target, *args, **kwargs)

    def pipe(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.pipe(target, *args, **kwargs)

    async def _cat_file(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._cat_file(target, *args, **kwargs)

    def cat_file(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.cat_file(target, *args, **kwargs)

    async def _cat(self, path, *args, **kwargs):
        result = await self.fs._cat(self._join(path), *args, **kwargs)
        if not isinstance(result, dict):
            return result
        # dict results are keyed by path: make the keys relative again
        return {self._relpath(name): blob for name, blob in result.items()}

    def cat(self, path, *args, **kwargs):
        result = self.fs.cat(self._join(path), *args, **kwargs)
        if not isinstance(result, dict):
            return result
        # dict results are keyed by path: make the keys relative again
        return {self._relpath(name): blob for name, blob in result.items()}

    async def _put_file(self, lpath, rpath, **kwargs):
        remote = self._join(rpath)
        return await self.fs._put_file(lpath, remote, **kwargs)

    def put_file(self, lpath, rpath, **kwargs):
        remote = self._join(rpath)
        return self.fs.put_file(lpath, remote, **kwargs)

    async def _put(
        self,
        lpath,
        rpath,
        *args,
        **kwargs,
    ):
        remote = self._join(rpath)
        return await self.fs._put(lpath, remote, *args, **kwargs)

    def put(self, lpath, rpath, *args, **kwargs):
        remote = self._join(rpath)
        return self.fs.put(lpath, remote, *args, **kwargs)

    async def _get_file(self, rpath, lpath, **kwargs):
        remote = self._join(rpath)
        return await self.fs._get_file(remote, lpath, **kwargs)

    def get_file(self, rpath, lpath, **kwargs):
        remote = self._join(rpath)
        return self.fs.get_file(remote, lpath, **kwargs)

    async def _get(self, rpath, *args, **kwargs):
        remote = self._join(rpath)
        return await self.fs._get(remote, *args, **kwargs)

    def get(self, rpath, *args, **kwargs):
        remote = self._join(rpath)
        return self.fs.get(remote, *args, **kwargs)

    async def _isfile(self, path):
        target = self._join(path)
        return await self.fs._isfile(target)

    def isfile(self, path):
        target = self._join(path)
        return self.fs.isfile(target)

    async def _isdir(self, path):
        target = self._join(path)
        return await self.fs._isdir(target)

    def isdir(self, path):
        target = self._join(path)
        return self.fs.isdir(target)

    async def _size(self, path):
        target = self._join(path)
        return await self.fs._size(target)

    def size(self, path):
        target = self._join(path)
        return self.fs.size(target)

    async def _exists(self, path):
        target = self._join(path)
        return await self.fs._exists(target)

    def exists(self, path):
        target = self._join(path)
        return self.fs.exists(target)

    async def _info(self, path, **kwargs):
        target = self._join(path)
        return await self.fs._info(target, **kwargs)

    def info(self, path, **kwargs):
        target = self._join(path)
        return self.fs.info(target, **kwargs)

    async def _ls(self, path, detail=True, **kwargs):
        listing = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
        if not detail:
            return self._relpath(listing)

        relative = []
        for item in listing:
            # copy each entry so the wrapped filesystem's dicts stay untouched
            item = item.copy()
            item["name"] = self._relpath(item["name"])
            relative.append(item)
        return relative

    def ls(self, path, detail=True, **kwargs):
        listing = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
        if not detail:
            return self._relpath(listing)

        relative = []
        for item in listing:
            # copy each entry so the wrapped filesystem's dicts stay untouched
            item = item.copy()
            item["name"] = self._relpath(item["name"])
            relative.append(item)
        return relative

    async def _walk(self, path, *args, **kwargs):
        top = self._join(path)
        async for root, dirs, files in self.fs._walk(top, *args, **kwargs):
            yield self._relpath(root), dirs, files

    def walk(self, path, *args, **kwargs):
        top = self._join(path)
        for root, dirs, files in self.fs.walk(top, *args, **kwargs):
            yield self._relpath(root), dirs, files

    async def _glob(self, path, **kwargs):
        with_detail = kwargs.get("detail", False)
        matches = await self.fs._glob(self._join(path), **kwargs)
        if not with_detail:
            return self._relpath(matches)
        return {self._relpath(name): meta for name, meta in matches.items()}

    def glob(self, path, **kwargs):
        with_detail = kwargs.get("detail", False)
        matches = self.fs.glob(self._join(path), **kwargs)
        if not with_detail:
            return self._relpath(matches)
        return {self._relpath(name): meta for name, meta in matches.items()}

    async def _du(self, path, *args, **kwargs):
        want_total = kwargs.get("total", True)
        usage = await self.fs._du(self._join(path), *args, **kwargs)
        if want_total:
            return usage

        return {self._relpath(name): nbytes for name, nbytes in usage.items()}

    def du(self, path, *args, **kwargs):
        want_total = kwargs.get("total", True)
        usage = self.fs.du(self._join(path), *args, **kwargs)
        if want_total:
            return usage

        return {self._relpath(name): nbytes for name, nbytes in usage.items()}

    async def _find(self, path, *args, **kwargs):
        with_detail = kwargs.get("detail", False)
        found = await self.fs._find(self._join(path), *args, **kwargs)
        if not with_detail:
            return self._relpath(found)
        return {self._relpath(name): meta for name, meta in found.items()}

    def find(self, path, *args, **kwargs):
        with_detail = kwargs.get("detail", False)
        found = self.fs.find(self._join(path), *args, **kwargs)
        if not with_detail:
            return self._relpath(found)
        return {self._relpath(name): meta for name, meta in found.items()}

    async def _expand_path(self, path, *args, **kwargs):
        expanded = await self.fs._expand_path(self._join(path), *args, **kwargs)
        return self._relpath(expanded)

    def expand_path(self, path, *args, **kwargs):
        expanded = self.fs.expand_path(self._join(path), *args, **kwargs)
        return self._relpath(expanded)

    async def _mkdir(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._mkdir(target, *args, **kwargs)

    def mkdir(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.mkdir(target, *args, **kwargs)

    async def _makedirs(self, path, *args, **kwargs):
        target = self._join(path)
        return await self.fs._makedirs(target, *args, **kwargs)

    def makedirs(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.makedirs(target, *args, **kwargs)

    def rmdir(self, path):
        target = self._join(path)
        return self.fs.rmdir(target)

    def mv_file(self, path1, path2, **kwargs):
        source = self._join(path1)
        destination = self._join(path2)
        return self.fs.mv_file(source, destination, **kwargs)

    def touch(self, path, **kwargs):
        target = self._join(path)
        return self.fs.touch(target, **kwargs)

    def created(self, path):
        target = self._join(path)
        return self.fs.created(target)

    def modified(self, path):
        target = self._join(path)
        return self.fs.modified(target)

    def sign(self, path, *args, **kwargs):
        target = self._join(path)
        return self.fs.sign(target, *args, **kwargs)

    def __repr__(self):
        return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"

    def open(
        self,
        path,
        *args,
        **kwargs,
    ):
        target = self._join(path)
        return self.fs.open(target, *args, **kwargs)
lib/python3.11/site-packages/fsspec/implementations/ftp.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import uuid
4
+ import warnings
5
+ from ftplib import FTP, Error, error_perm
6
+ from typing import Any
7
+
8
+ from ..spec import AbstractBufferedFile, AbstractFileSystem
9
+ from ..utils import infer_storage_options, isfilelike
10
+
11
+
12
class FTPFileSystem(AbstractFileSystem):
    """A filesystem over classic FTP"""

    root_marker = "/"
    cachable = False
    protocol = "ftp"

    def __init__(
        self,
        host,
        port=21,
        username=None,
        password=None,
        acct=None,
        block_size=None,
        tempdir=None,
        timeout=30,
        encoding="utf-8",
        **kwargs,
    ):
        """
        You can use _get_kwargs_from_urls to get some kwargs from
        a reasonable FTP url.

        Authentication will be anonymous if username/password are not
        given.

        Parameters
        ----------
        host: str
            The remote server name/ip to connect to
        port: int
            Port to connect with
        username: str or None
            If authenticating, the user's identifier
        password: str or None
            User's password on the server, if using
        acct: str or None
            Some servers also need an "account" string for auth
        block_size: int or None
            If given, the read-ahead or write buffer size.
        tempdir: str
            Directory on remote to put temporary files when in a transaction
        timeout: int
            Timeout of the ftp connection in seconds
        encoding: str
            Encoding to use for directories and filenames in FTP connection
        """
        super().__init__(**kwargs)
        self.host = host
        self.port = port
        self.tempdir = tempdir or "/tmp"
        self.cred = username, password, acct
        self.timeout = timeout
        self.encoding = encoding
        if block_size is not None:
            self.blocksize = block_size
        else:
            self.blocksize = 2**16
        self._connect()

    def _connect(self):
        """Create ``self.ftp``, connect to host/port and log in with ``self.cred``."""
        if sys.version_info >= (3, 9):
            self.ftp = FTP(timeout=self.timeout, encoding=self.encoding)
        elif self.encoding:
            warnings.warn("`encoding` not supported for python<3.9, ignoring")
            self.ftp = FTP(timeout=self.timeout)
        else:
            self.ftp = FTP(timeout=self.timeout)
        self.ftp.connect(self.host, self.port)
        self.ftp.login(*self.cred)

    @classmethod
    def _strip_protocol(cls, path):
        """Return the path component with one leading and no trailing slash."""
        return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(urlpath):
        """Return the inferred URL options minus ``path`` and ``protocol``."""
        out = infer_storage_options(urlpath)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def ls(self, path, detail=True, **kwargs):
        """List ``path``, caching the listing in ``self.dircache``.

        Uses MLSD and falls back to parsing ``dir`` output when the server
        rejects it.  If listing fails with an FTP error, ``path`` is looked
        up via ``info`` and returned as a single entry when it is a file.

        Raises
        ------
        FileNotFoundError
            If the listing fails and the ``info`` lookup fails as well.
        """
        path = self._strip_protocol(path)
        out = []
        if path not in self.dircache:
            try:
                try:
                    out = [
                        (fn, details)
                        for (fn, details) in self.ftp.mlsd(path)
                        if fn not in [".", ".."]
                        and details["type"] not in ["pdir", "cdir"]
                    ]
                except error_perm:
                    out = _mlsd2(self.ftp, path)  # Not platform independent
                # Root entries are named "/<fn>" rather than "//<fn>".  Use a
                # separate prefix so ``path`` stays "/" and remains the key
                # that the cache lookup and invalidation use.
                prefix = "" if path == "/" else path
                for fn, details in out:
                    details["name"] = "/".join([prefix, fn.lstrip("/")])
                    if details["type"] == "file":
                        details["size"] = int(details["size"])
                    else:
                        details["size"] = 0
                    if details["type"] == "dir":
                        details["type"] = "directory"
                self.dircache[path] = out
            except Error:
                try:
                    info = self.info(path)
                    if info["type"] == "file":
                        out = [(path, info)]
                except (Error, IndexError) as exc:
                    raise FileNotFoundError(path) from exc
        files = self.dircache.get(path, out)
        if not detail:
            return sorted([fn for fn, details in files])
        return [details for fn, details in files]

    def info(self, path, **kwargs):
        """Return the details dict for ``path`` from its parent's listing.

        Raises
        ------
        FileNotFoundError
            If no entry of the parent listing is named ``path``.
        """
        # implement with direct method
        path = self._strip_protocol(path)
        if path == "/":
            # special case, since this dir has no real entry
            return {"name": "/", "size": 0, "type": "directory"}
        files = self.ls(self._parent(path).lstrip("/"), True)
        try:
            out = [f for f in files if f["name"] == path][0]
        except IndexError:
            raise FileNotFoundError(path)
        return out

    def get_file(self, rpath, lpath, **kwargs):
        """Download ``rpath`` into ``lpath`` (a local path or file-like object).

        A remote directory only results in the local directory being created.
        A file opened here is closed even when the transfer fails; a
        caller-supplied file-like object is left open.
        """
        if self.isdir(rpath):
            if not os.path.exists(lpath):
                os.mkdir(lpath)
            return
        opened_here = not isfilelike(lpath)
        outfile = open(lpath, "wb") if opened_here else lpath
        try:
            self.ftp.retrbinary(
                f"RETR {rpath}",
                blocksize=self.blocksize,
                callback=outfile.write,
            )
        finally:
            if opened_here:
                outfile.close()

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes of ``path`` from ``start`` onwards.

        When ``end`` is given the parent class implementation is used;
        otherwise the file is retrieved in one RETR using ``start`` as the
        restart offset.
        """
        if end is not None:
            return super().cat_file(path, start, end, **kwargs)
        out = []

        def cb(x):
            out.append(x)

        self.ftp.retrbinary(
            f"RETR {path}",
            blocksize=self.blocksize,
            rest=start,
            callback=cb,
        )
        return b"".join(out)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        autocommit=True,
        **kwargs,
    ):
        """Return an ``FTPFile`` for ``path``; ``block_size`` defaults to ``self.blocksize``."""
        path = self._strip_protocol(path)
        block_size = block_size or self.blocksize
        return FTPFile(
            self,
            path,
            mode=mode,
            block_size=block_size,
            tempdir=self.tempdir,
            autocommit=autocommit,
            cache_options=cache_options,
        )

    def _rm(self, path):
        """Delete a single remote file and drop its parent's cached listing."""
        path = self._strip_protocol(path)
        self.ftp.delete(path)
        self.invalidate_cache(self._parent(path))

    def rm(self, path, recursive=False, maxdepth=None):
        """Remove the expanded paths in reverse order, files via ``rm_file``, the rest via ``rmdir``."""
        paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
        for p in reversed(paths):
            if self.isfile(p):
                self.rm_file(p)
            else:
                self.rmdir(p)

    def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
        """Create directory ``path``, first creating missing parents if requested."""
        path = self._strip_protocol(path)
        parent = self._parent(path)
        if parent != self.root_marker and not self.exists(parent) and create_parents:
            self.mkdir(parent, create_parents=create_parents)

        self.ftp.mkd(path)
        self.invalidate_cache(self._parent(path))

    def makedirs(self, path: str, exist_ok: bool = False) -> None:
        """Create ``path`` with parents; raise FileExistsError if it exists and ``exist_ok`` is false."""
        path = self._strip_protocol(path)
        if self.exists(path):
            # NB: "/" does not "exist" as it has no directory entry
            if not exist_ok:
                raise FileExistsError(f"{path} exists without `exist_ok`")
            # exists_ok=True -> no-op
        else:
            self.mkdir(path, create_parents=True)

    def rmdir(self, path):
        """Remove a remote directory and drop its parent's cached listing."""
        path = self._strip_protocol(path)
        self.ftp.rmd(path)
        self.invalidate_cache(self._parent(path))

    def mv(self, path1, path2, **kwargs):
        """Rename ``path1`` to ``path2`` and drop both parents' cached listings."""
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        self.ftp.rename(path1, path2)
        self.invalidate_cache(self._parent(path1))
        self.invalidate_cache(self._parent(path2))

    def __del__(self):
        # ``ftp`` is missing when ``__init__`` failed before ``_connect`` set it.
        ftp = getattr(self, "ftp", None)
        if ftp is not None:
            ftp.close()

    def invalidate_cache(self, path=None):
        """Drop the cached listing for ``path``, or all listings when ``path`` is None."""
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)
256
+
257
+
258
class TransferDone(Exception):
    """Internal exception to break out of transfer"""
262
+
263
+
264
class FTPFile(AbstractBufferedFile):
    """Interact with a remote FTP file with read/write buffering"""

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """Set up the buffered file.

        Without ``autocommit`` the data is written to a uniquely named file
        under ``kwargs["tempdir"]``; the requested path is kept in
        ``self.target`` for ``commit``.
        """
        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        if autocommit:
            return
        self.target = self.path
        temp_dir = kwargs["tempdir"]
        temp_name = str(uuid.uuid4())
        self.path = "/".join([temp_dir, temp_name])

    def commit(self):
        """Move the temporary file to the target path."""
        self.fs.mv(self.path, self.target)

    def discard(self):
        """Remove the file at the current (temporary) path."""
        self.fs.rm(self.path)

    def _fetch_range(self, start, end):
        """Get bytes between given byte limits

        Implemented by raising an exception in the fetch callback when the
        number of bytes received reaches the requested amount.

        Will fail if the server does not respect the REST command on
        retrieve requests.
        """
        wanted = end - start
        chunks = []
        received = 0

        def callback(data):
            nonlocal received
            received += len(data)
            if received > wanted:
                # negative stop index trims the bytes past the requested end
                chunks.append(data[: wanted - received])
                if end < self.size:
                    raise TransferDone
            else:
                chunks.append(data)

            if received == wanted and end < self.size:
                raise TransferDone

        try:
            self.fs.ftp.retrbinary(
                f"RETR {self.path}",
                blocksize=self.blocksize,
                rest=start,
                callback=callback,
            )
        except TransferDone:
            try:
                # stop transfer, we got enough bytes for this block
                self.fs.ftp.abort()
                self.fs.ftp.getmultiline()
            except Error:
                # the abort failed; start a fresh connection
                self.fs._connect()

        return b"".join(chunks)

    def _upload_chunk(self, final=False):
        """Send the buffer with STOR, using ``self.offset`` as the restart marker."""
        self.buffer.seek(0)
        command = f"STOR {self.path}"
        self.fs.ftp.storbinary(
            command, self.buffer, blocksize=self.blocksize, rest=self.offset
        )
        return True
345
+
346
+
347
def _mlsd2(ftp, path="."):
    """
    Fall back to using `dir` instead of `mlsd` if not supported.

    This parses a Linux style `ls -l` response to `dir`, but the response may
    be platform dependent.

    Blank lines and the ``total N`` summary line are skipped.  The name is
    everything after the eighth whitespace-separated field, so names with
    spaces are kept whole.  For symbolic links (mode starting with ``l``)
    the ``" -> target"`` suffix is dropped so the link's own name is used.

    Parameters
    ----------
    ftp: ftplib.FTP
    path: str
        Expects to be given path, but defaults to ".".

    Returns
    -------
    list of (name, details) tuples, where details has the keys ``modify``,
    ``unix.owner``, ``unix.group``, ``unix.mode``, ``size`` (still a str)
    and ``type`` ("dir" or "file").
    """
    lines = []
    minfo = []
    ftp.dir(path, lines.append)
    for line in lines:
        # mode, links, owner, group, size, three date fields, then the name
        fields = line.split(None, 8)
        if not fields or fields[0] == "total":
            continue
        mode = fields[0]
        name = fields[-1]
        if mode.startswith("l") and " -> " in name:
            name = name.split(" -> ", 1)[0]
        details = {
            "modify": " ".join(fields[5:8]),
            "unix.owner": fields[2],
            "unix.group": fields[3],
            "unix.mode": mode,
            "size": fields[4],
        }
        details["type"] = "dir" if mode[0] == "d" else "file"
        minfo.append((name, details))
    return minfo
lib/python3.11/site-packages/fsspec/implementations/git.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pygit2
4
+
5
+ from fsspec.spec import AbstractFileSystem
6
+
7
+ from .memory import MemoryFile
8
+
9
+
10
class GitFileSystem(AbstractFileSystem):
    """Browse the files of a local git repo at any hash/tag/branch

    (experimental backend)
    """

    root_marker = ""
    cachable = True

    def __init__(self, path=None, fo=None, ref=None, **kwargs):
        """

        Parameters
        ----------
        path: str (optional)
            Local location of the repo (uses current directory if not given).
            May be deprecated in favour of ``fo``. When used with a higher
            level function such as fsspec.open(), may be of the form
            "git://[path-to-repo[:]][ref@]path/to/file" (but the actual
            file path should not contain "@" or ":").
        fo: str (optional)
            Same as ``path``, but passed as part of a chained URL. This one
            takes precedence if both are given.
        ref: str (optional)
            Reference to work with, could be a hash, tag or branch name.
            Falls back to "master" when not given. ``ls`` and ``open`` also
            accept a ``ref``; this value is the default for those operations.
        kwargs
        """
        super().__init__(**kwargs)
        location = fo or path or os.getcwd()
        self.repo = pygit2.Repository(location)
        self.ref = ref or "master"

    @classmethod
    def _strip_protocol(cls, path):
        """Drop the protocol, then any "repo:" and "ref@" prefixes and leading slashes."""
        stripped = super()._strip_protocol(path).lstrip("/")
        for marker in (":", "@"):
            if marker in stripped:
                stripped = stripped.split(marker, 1)[1]
        return stripped.lstrip("/")

    def _path_to_object(self, path, ref):
        """Walk from the tree of ``ref`` (or ``self.ref``) down ``path``.

        Descends one component at a time for as long as the current object
        is a tree, and returns the object reached.
        """
        commit, _ = self.repo.resolve_refish(ref or self.ref)
        node = commit.tree
        for part in path.split("/"):
            if part and isinstance(node, pygit2.Tree):
                node = node[part]
        return node

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract ``path`` (text before ":") and ``ref`` (text before "@") from a URL."""
        if path.startswith("git://"):
            path = path[len("git://") :]
        out = {}
        repo_part, colon, remainder = path.partition(":")
        if colon:
            out["path"] = repo_part
            path = remainder
        ref_part, at_sign, remainder = path.partition("@")
        if at_sign:
            out["ref"] = ref_part
        return out

    def ls(self, path, detail=True, ref=None, **kwargs):
        """List the tree at ``path``, or describe the single object found there.

        Returns a list of dicts (``type``, ``name``, ``hex``, ``mode``,
        ``size``) when ``detail`` is true, otherwise a list of names.
        """
        path = self._strip_protocol(path)
        target = self._path_to_object(path, ref)
        if isinstance(target, pygit2.Tree):
            out = []
            for obj in target:
                is_dir = isinstance(obj, pygit2.Tree)
                out.append(
                    {
                        "type": "directory" if is_dir else "file",
                        "name": "/".join([path, obj.name]).lstrip("/"),
                        "hex": obj.hex,
                        "mode": f"{obj.filemode:o}",
                        "size": 0 if is_dir else obj.size,
                    }
                )
        else:
            out = [
                {
                    "type": "file",
                    "name": target.name,
                    "hex": target.hex,
                    "mode": f"{target.filemode:o}",
                    "size": target.size,
                }
            ]
        if not detail:
            return [entry["name"] for entry in out]
        return out

    def ukey(self, path, ref=None):
        """Return the ``hex`` field of ``info`` for ``path`` at ``ref``."""
        details = self.info(path, ref=ref)
        return details["hex"]

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        ref=None,
        **kwargs,
    ):
        """Return a ``MemoryFile`` holding the data of the object at ``path``."""
        blob = self._path_to_object(path, ref or self.ref)
        return MemoryFile(data=blob.data)
lib/python3.11/site-packages/fsspec/implementations/github.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
+ from ..spec import AbstractFileSystem
4
+ from ..utils import infer_storage_options
5
+ from .memory import MemoryFile
6
+
7
+ # TODO: add GIST backend, would be very similar
8
+
9
+
10
class GithubFileSystem(AbstractFileSystem):
    """Interface to files in github

    An instance of this class provides the files residing within a remote github
    repository. You may specify a point in the repos history, by SHA, branch
    or tag (default is the repository's default branch).

    Given that code files tend to be small, and that github does not support
    retrieving partial content, we always fetch whole files.

    When using fsspec.open, allows URIs of the form:

    - "github://path/file", in which case you must specify org, repo and
      may specify sha in the extra args
    - 'github://org:repo@/precip/catalog.yml', where the org and repo are
      part of the URI
    - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included

    ``sha`` can be the full or abbreviated hex of the commit you want to fetch
    from, or a branch or tag name (so long as it doesn't contain special characters
    like "/", "?", which would have to be HTTP-encoded).

    For authorised access, you must provide username and token, which can be made
    at https://github.com/settings/tokens
    """

    url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
    rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
    protocol = "github"

    def __init__(self, org, repo, sha=None, username=None, token=None, **kwargs):
        """Store the repo coordinates and credentials, then list the root.

        When ``sha`` is not given, the repository's default branch is looked
        up through the API.

        Raises
        ------
        ValueError
            If only one of ``username`` and ``token`` is given.
        """
        super().__init__(**kwargs)
        self.org = org
        self.repo = repo
        if (username is None) ^ (token is None):
            raise ValueError("Auth required both username and token")
        self.username = username
        self.token = token
        if sha is None:
            # look up default branch (not necessarily "master")
            u = "https://api.github.com/repos/{org}/{repo}"
            r = requests.get(u.format(org=org, repo=repo), **self.kw)
            r.raise_for_status()
            sha = r.json()["default_branch"]

        self.root = sha
        self.ls("")

    @property
    def kw(self):
        """Keyword arguments for ``requests.get``: basic auth if a username is set."""
        if self.username:
            return {"auth": (self.username, self.token)}
        return {}

    @classmethod
    def repos(cls, org_or_user, is_org=True):
        """List repo names for given org or user

        This may become the top level of the FS

        Parameters
        ----------
        org_or_user: str
            Name of the github org or user to query
        is_org: bool (default True)
            Whether the name is an organisation (True) or user (False)

        Returns
        -------
        List of string
        """
        r = requests.get(
            f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos"
        )
        r.raise_for_status()
        return [repo["name"] for repo in r.json()]

    @property
    def tags(self):
        """Names of tags in the repo"""
        r = requests.get(
            f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
            **self.kw,
        )
        r.raise_for_status()
        return [t["name"] for t in r.json()]

    @property
    def branches(self):
        """Names of branches in the repo"""
        r = requests.get(
            f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
            **self.kw,
        )
        r.raise_for_status()
        return [t["name"] for t in r.json()]

    @property
    def refs(self):
        """Named references, tags and branches"""
        return {"tags": self.tags, "branches": self.branches}

    def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
        """List files at given path

        Parameters
        ----------
        path: str
            Location to list, relative to repo root
        detail: bool
            If True, returns list of dicts, one per file; if False, returns
            list of full filenames only
        sha: str (optional)
            List at the given point in the repo history, branch or tag name or commit
            SHA
        _sha: str (optional)
            List this specific tree object (used internally to descend into trees)

        Raises
        ------
        FileNotFoundError
            If a path component is missing or the API answers 404.
        """
        path = self._strip_protocol(path)
        if path == "":
            _sha = sha or self.root
        if _sha is None:
            # Resolve the tree sha of ``path`` by walking down from the root.
            parts = path.rstrip("/").split("/")
            so_far = ""
            _sha = sha or self.root
            for part in parts:
                out = self.ls(so_far, True, sha=sha, _sha=_sha)
                so_far += "/" + part if so_far else part
                out = [o for o in out if o["name"] == so_far]
                if not out:
                    raise FileNotFoundError(path)
                out = out[0]
                if out["type"] == "file":
                    if detail:
                        return [out]
                    else:
                        # a list, like every other ``detail=False`` result
                        return [path]
                _sha = out["sha"]
        if path not in self.dircache or sha not in [self.root, None]:
            r = requests.get(
                self.url.format(org=self.org, repo=self.repo, sha=_sha), **self.kw
            )
            if r.status_code == 404:
                raise FileNotFoundError(path)
            r.raise_for_status()
            types = {"blob": "file", "tree": "directory"}
            out = [
                {
                    "name": path + "/" + f["path"] if path else f["path"],
                    "mode": f["mode"],
                    "type": types[f["type"]],
                    "size": f.get("size", 0),
                    "sha": f["sha"],
                }
                for f in r.json()["tree"]
                if f["type"] in types
            ]
            # only listings at the default reference are cached
            if sha in [self.root, None]:
                self.dircache[path] = out
        else:
            out = self.dircache[path]
        if detail:
            return out
        else:
            return sorted([f["name"] for f in out])

    def invalidate_cache(self, path=None):
        """Clear the whole listing cache; ``path`` is ignored."""
        self.dircache.clear()

    @classmethod
    def _strip_protocol(cls, path):
        """Return the in-repo path, dropping any "org:repo@sha" part of the URL."""
        opts = infer_storage_options(path)
        if "username" not in opts:
            return super()._strip_protocol(path)
        return opts["path"].lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Read ``org``, ``repo`` and optionally ``sha`` from an "org:repo@sha" URL."""
        opts = infer_storage_options(path)
        if "username" not in opts:
            return {}
        out = {"org": opts["username"], "repo": opts["password"]}
        if opts["host"]:
            out["sha"] = opts["host"]
        return out

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        sha=None,
        **kwargs,
    ):
        """Fetch the whole file from the raw-content URL into a ``MemoryFile``.

        Raises
        ------
        NotImplementedError
            For any mode other than "rb".
        FileNotFoundError
            If the server answers 404.
        """
        if mode != "rb":
            raise NotImplementedError
        url = self.rurl.format(
            org=self.org, repo=self.repo, path=path, sha=sha or self.root
        )
        r = requests.get(url, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        return MemoryFile(None, None, r.content)
lib/python3.11/site-packages/fsspec/implementations/http.py ADDED
@@ -0,0 +1,864 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import io
3
+ import logging
4
+ import re
5
+ import weakref
6
+ from copy import copy
7
+ from urllib.parse import urlparse
8
+
9
+ import aiohttp
10
+ import requests
11
+ import yarl
12
+
13
+ from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
14
+ from fsspec.callbacks import _DEFAULT_CALLBACK
15
+ from fsspec.exceptions import FSTimeoutError
16
+ from fsspec.spec import AbstractBufferedFile
17
+ from fsspec.utils import (
18
+ DEFAULT_BLOCK_SIZE,
19
+ glob_translate,
20
+ isfilelike,
21
+ nullcontext,
22
+ tokenize,
23
+ )
24
+
25
+ from ..caching import AllBytes
26
+
27
+ # https://stackoverflow.com/a/15926317/3821154
28
+ ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
29
+ ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
30
+ logger = logging.getLogger("fsspec.http")
31
+
32
+
33
async def get_client(**kwargs):
    """Default session factory: build an ``aiohttp.ClientSession`` from ``kwargs``."""
    session = aiohttp.ClientSession(**kwargs)
    return session
35
+
36
+
37
+ class HTTPFileSystem(AsyncFileSystem):
38
+ """
39
+ Simple File-System for fetching data via HTTP(S)
40
+
41
+ ``ls()`` is implemented by loading the parent page and doing a regex
42
+ match on the result. If simple_link=True, anything of the form
43
+ "http(s)://server.com/stuff?thing=other"; otherwise only links within
44
+ HTML href tags will be used.
45
+ """
46
+
47
+ sep = "/"
48
+
49
def __init__(
    self,
    simple_links=True,
    block_size=None,
    same_scheme=True,
    size_policy=None,
    cache_type="bytes",
    cache_options=None,
    asynchronous=False,
    loop=None,
    client_kwargs=None,
    get_client=get_client,
    encoded=False,
    **storage_options,
):
    """
    NB: if this is called async, you must await set_client

    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    same_scheme: True
        When doing ls/glob, if this is True, only consider paths that have
        http/https matching the input URLs.
    size_policy: this argument is deprecated
    client_kwargs: dict
        Passed to aiohttp.ClientSession, see
        https://docs.aiohttp.org/en/stable/client_reference.html
        For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
    get_client: Callable[..., aiohttp.ClientSession]
        A callable which takes keyword arguments and constructs
        an aiohttp.ClientSession. Its state will be managed by
        the HTTPFileSystem class.
    storage_options: key-value
        Any other parameters passed on to requests
    cache_type, cache_options: defaults used in open
    """
    # NOTE(review): ``self`` is also passed as a positional argument here;
    # confirm that the parent initialiser is meant to receive it.
    super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
    self.block_size = DEFAULT_BLOCK_SIZE if block_size is None else block_size
    self.simple_links = simple_links
    self.same_schema = same_scheme
    self.cache_type = cache_type
    self.cache_options = cache_options
    self.client_kwargs = client_kwargs or {}
    self.get_client = get_client
    self.encoded = encoded
    self._session = None

    # Clean caching-related parameters from `storage_options`
    # before propagating them as `request_options` through `self.kwargs`.
    # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
    #  it clearer.
    request_options = copy(storage_options)
    self.use_listings_cache = request_options.pop("use_listings_cache", False)
    for cache_key in ("listings_expiry_time", "max_paths", "skip_instance_cache"):
        request_options.pop(cache_key, None)
    self.kwargs = request_options
113
+
114
@property
def fsid(self):
    """Filesystem identifier; always the string ``"http"``."""
    identifier = "http"
    return identifier
117
+
118
def encode_url(self, url):
    """Wrap ``url`` in a ``yarl.URL``, passing ``self.encoded`` as its ``encoded`` flag."""
    already_encoded = self.encoded
    return yarl.URL(url, encoded=already_encoded)
120
+
121
+ @staticmethod
122
+ def close_session(loop, session):
123
+ if loop is not None and loop.is_running():
124
+ try:
125
+ sync(loop, session.close, timeout=0.1)
126
+ return
127
+ except (TimeoutError, FSTimeoutError):
128
+ pass
129
+ connector = getattr(session, "_connector", None)
130
+ if connector is not None:
131
+ # close after loop is dead
132
+ connector._close()
133
+
134
async def set_session(self):
    """Return the client session, creating it on first use.

    The session comes from ``self.get_client``, called with ``self.loop``
    and ``self.client_kwargs``.  For a non-asynchronous instance a
    finalizer is registered that calls ``close_session`` with the loop and
    the session.
    """
    if self._session is not None:
        return self._session
    self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
    if not self.asynchronous:
        weakref.finalize(self, self.close_session, self.loop, self._session)
    return self._session
140
+
141
+ @classmethod
142
+ def _strip_protocol(cls, path):
143
+ """For HTTP, we always want to keep the full URL"""
144
+ return path
145
+
146
@classmethod
def _parent(cls, path):
    """Return the parent URL, or "" when the parent is 7 characters or shorter."""
    # override, since _strip_protocol is different for URLs
    parent = super()._parent(path)
    # anything this short cannot hold more than the "http://" prefix
    return parent if len(parent) > 7 else ""
153
+
154
async def _ls_real(self, url, detail=True, **kwargs):
    """Fetch ``url`` and derive a listing from the links found in its text.

    Links are taken from href attributes and, when ``self.simple_links`` is
    set, also from anything that looks like an http(s) URL.  Only links
    below ``url`` are kept.  If nothing is found and ``url`` ends with "/",
    the listing is retried without the trailing slash.

    Returns a list of ``{"name", "size", "type"}`` dicts when ``detail`` is
    true, otherwise the sorted list of names.
    """
    # ignoring URL-encoded arguments
    # NOTE(review): the merged ``kw`` is built but the request below is made
    # with ``self.kwargs`` only - confirm whether per-call kwargs are meant
    # to be ignored here.
    kw = self.kwargs.copy()
    kw.update(kwargs)
    logger.debug(url)
    session = await self.set_session()
    async with session.get(self.encode_url(url), **self.kwargs) as r:
        self._raise_not_found_for_status(r, url)
        text = await r.text()
        href_links = [match[2] for match in ex.findall(text)]
        if self.simple_links:
            links = ex2.findall(text) + href_links
        else:
            links = href_links
    out = set()
    parts = urlparse(url)
    base = url.rstrip("/")
    same_scheme_prefix = base + "/"
    any_scheme_prefix = url.replace("https", "http").rstrip("/") + "/"
    for link in links:
        # NOTE(review): every entry looks like a str here, so this branch
        # appears to be unreachable - confirm before removing.
        if isinstance(link, tuple):
            link = link[1]
        if link.startswith("/") and len(link) > 1:
            # absolute URL on this server
            link = f"{parts.scheme}://{parts.netloc}{link}"
        if link.startswith("http"):
            if self.same_schema and link.startswith(same_scheme_prefix):
                out.add(link)
            elif link.replace("https", "http").startswith(any_scheme_prefix):
                # allowed to cross http <-> https
                out.add(link)
        elif link not in ["..", "../"]:
            # Ignore FTP-like "parent"
            out.add("/".join([base, link.lstrip("/")]))
    if not out and url.endswith("/"):
        out = await self._ls_real(url.rstrip("/"), detail=False)
    if not detail:
        return sorted(out)
    return [
        {
            "name": u,
            "size": None,
            "type": "directory" if u.endswith("/") else "file",
        }
        for u in out
    ]
200
+
201
async def _ls(self, url, detail=True, **kwargs):
    """List ``url``, going through the listings cache when it is enabled.

    The cache always stores the detailed form (a list of dicts with a
    ``name`` key), so a cached listing can answer both ``detail=True`` and
    ``detail=False`` calls regardless of which was made first.

    Returns
    -------
    The list of dicts when ``detail`` is true, otherwise the sorted list of
    their names.
    """
    if self.use_listings_cache and url in self.dircache:
        entries = self.dircache[url]
    else:
        entries = await self._ls_real(url, detail=True, **kwargs)
        self.dircache[url] = entries
    if detail:
        return entries
    return sorted(entry["name"] for entry in entries)
208
+
209
+ ls = sync_wrapper(_ls)
210
+
211
def _raise_not_found_for_status(self, response, url):
    """
    Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
    """
    if response.status != 404:
        response.raise_for_status()
        return
    raise FileNotFoundError(url)
218
+
219
async def _cat_file(self, url, start=None, end=None, **kwargs):
    """Return the body of ``url``, optionally restricted to a byte range.

    When ``start`` or ``end`` is given, a ``Range`` header from
    ``self._process_limits`` is added to a copy of the request headers;
    equal ``start`` and ``end`` return ``b""`` without a request.  The
    body is read before the status is checked.
    """
    request_kwargs = {**self.kwargs, **kwargs}
    logger.debug(url)

    if not (start is None and end is None):
        if start == end:
            return b""
        # copy, so that headers supplied by the caller are not mutated
        range_headers = request_kwargs.pop("headers", {}).copy()
        range_headers["Range"] = await self._process_limits(url, start, end)
        request_kwargs["headers"] = range_headers
    session = await self.set_session()
    async with session.get(self.encode_url(url), **request_kwargs) as response:
        body = await response.read()
        self._raise_not_found_for_status(response, url)
    return body
236
+
237
async def _get_file(
    self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
):
    """Stream ``rpath`` into ``lpath`` (a local path or file-like object).

    The callback's size is set from the ``content-length`` header (None if
    missing or not an integer) before the status is checked.  Data is
    copied in ``chunk_size`` reads; a file opened here is closed afterwards,
    while a caller-supplied file-like object is left open.
    """
    request_kwargs = {**self.kwargs, **kwargs}
    logger.debug(rpath)
    session = await self.set_session()
    async with session.get(self.encode_url(rpath), **request_kwargs) as r:
        try:
            size = int(r.headers["content-length"])
        except (ValueError, KeyError):
            size = None

        callback.set_size(size)
        self._raise_not_found_for_status(r, rpath)
        outfile = lpath if isfilelike(lpath) else open(lpath, "wb")

        try:
            while True:
                chunk = await r.content.read(chunk_size)
                # the final, empty chunk is still written and reported
                outfile.write(chunk)
                callback.relative_update(len(chunk))
                if not chunk:
                    break
        finally:
            if not isfilelike(lpath):
                outfile.close()
266
+
267
async def _put_file(
    self,
    lpath,
    rpath,
    chunk_size=5 * 2**20,
    callback=_DEFAULT_CALLBACK,
    method="post",
    **kwargs,
):
    """Upload ``lpath`` to the URL ``rpath`` as a chunked request body.

    Parameters
    ----------
    lpath: str or io.IOBase
        Local path (opened with mode "rb") or an already-open file object,
        which is read from its current position and not closed here.
    rpath: str
        Destination URL.
    chunk_size: int
        Number of bytes read from the source per chunk.
    callback:
        Receives ``set_size`` once and ``relative_update`` after each chunk.
    method: str
        "post" or "put" (case-insensitive).
    kwargs:
        Layered over ``self.kwargs`` and passed to the session call.

    Raises
    ------
    ValueError
        If ``method`` is neither "post" nor "put".
    """

    async def gen_chunks():
        # Support passing arbitrary file-like objects
        # and use them instead of streams.
        if isinstance(lpath, io.IOBase):
            context = nullcontext(lpath)
            use_seek = False  # might not support seeking
        else:
            context = open(lpath, "rb")
            use_seek = True

        with context as f:
            if use_seek:
                # seek to the end to learn the size, then rewind
                callback.set_size(f.seek(0, 2))
                f.seek(0)
            else:
                callback.set_size(getattr(f, "size", None))

            chunk = f.read(chunk_size)
            while chunk:
                yield chunk
                callback.relative_update(len(chunk))
                chunk = f.read(chunk_size)

    kw = self.kwargs.copy()
    kw.update(kwargs)
    session = await self.set_session()

    method = method.lower()
    if method not in ("post", "put"):
        raise ValueError(
            f"method has to be either 'post' or 'put', not: {method!r}"
        )

    meth = getattr(session, method)
    async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
        self._raise_not_found_for_status(resp, rpath)
312
+
313
async def _exists(self, path, **kwargs):
    """Return True if a GET on ``path`` answers with a status below 400.

    ``requests.HTTPError`` and ``aiohttp.ClientError`` are treated as
    "does not exist".
    """
    request_kw = self.kwargs.copy()
    request_kw.update(kwargs)
    try:
        logger.debug(path)
        session = await self.set_session()
        response = await session.get(self.encode_url(path), **request_kw)
        async with response:
            reachable = response.status < 400
        return reachable
    except (requests.HTTPError, aiohttp.ClientError):
        return False
324
+
325
async def _isfile(self, path, **kwargs):
    """Report whether ``path`` is a file; delegates entirely to ``_exists``."""
    reachable = await self._exists(path, **kwargs)
    return reachable
327
+
328
def _open(
    self,
    path,
    mode="rb",
    block_size=None,
    autocommit=None,  # XXX: This differs from the base class.
    cache_type=None,
    cache_options=None,
    size=None,
    **kwargs,
):
    """Make a file-like object

    Parameters
    ----------
    path: str
        Full URL with protocol
    mode: string
        must be "rb"
    block_size: int or None
        Bytes to download in one request; use instance value if None. If
        zero, will return a streaming Requests file-like instance.
    cache_type: str or None
        Cache implementation name; falls back to ``self.cache_type``.
    cache_options: dict or None
        Options for the cache; falls back to ``self.cache_options``.
    size: int or None
        Known size of the file; looked up with ``self.info`` when falsy.
    kwargs: key-value
        Any other parameters, passed to requests calls

    Returns
    -------
    ``HTTPFile`` when both the block size and the size are truthy,
    otherwise ``HTTPStreamFile``.

    Raises
    ------
    NotImplementedError
        If ``mode`` is not "rb".
    """
    if mode != "rb":
        raise NotImplementedError
    block_size = block_size if block_size is not None else self.block_size
    kw = self.kwargs.copy()
    kw["asynchronous"] = self.asynchronous
    kw.update(kwargs)
    # note: ``or`` means a caller-supplied size of 0 also triggers a lookup
    size = size or self.info(path, **kwargs)["size"]
    session = sync(self.loop, self.set_session)
    if block_size and size:
        return HTTPFile(
            self,
            path,
            session=session,
            block_size=block_size,
            mode=mode,
            size=size,
            cache_type=cache_type or self.cache_type,
            cache_options=cache_options or self.cache_options,
            loop=self.loop,
            **kw,
        )
    else:
        # unknown size or block_size == 0: random access is not possible
        return HTTPStreamFile(
            self,
            path,
            mode=mode,
            loop=self.loop,
            session=session,
            **kw,
        )
383
+
384
async def open_async(self, path, mode="rb", size=None, **kwargs):
    """Create an ``AsyncStreamFile`` for ``path``.

    Parameters
    ----------
    path: str
        URL to open.
    mode: str
        Accepted for interface compatibility; it is not forwarded to
        ``AsyncStreamFile``, which therefore uses its own default.
    size: int or None
        Known size; when None it is looked up via ``self._info``.
    kwargs:
        Passed to ``self._info`` and to ``AsyncStreamFile``.
    """
    session = await self.set_session()
    if size is None:
        try:
            size = (await self._info(path, **kwargs))["size"]
        except FileNotFoundError:
            # leave size as None; the file object is still created
            pass
    return AsyncStreamFile(
        self,
        path,
        loop=self.loop,
        session=session,
        size=size,
        **kwargs,
    )
399
+
400
def ukey(self, url):
    """Unique identifier; assume HTTP files are static, unchanging

    The token is derived from the URL, the instance's default request
    kwargs and the protocol.
    """
    token_inputs = (url, self.kwargs, self.protocol)
    return tokenize(*token_inputs)
403
+
404
async def _info(self, url, **kwargs):
    """Get info of URL

    Tries to access location via HEAD, and then GET methods, but does
    not fetch the data.

    It is possible that the server does not supply any size information, in
    which case size will be given as None (and certain operations on the
    corresponding file will not work).

    Returns
    -------
    dict with "name", "size", "type" (always "file") and whatever other
    keys ``_file_info`` reported.

    Raises
    ------
    FileNotFoundError
        If the GET attempt raises any exception.
    """
    info = {}
    session = await self.set_session()

    for policy in ["head", "get"]:
        try:
            info.update(
                await _file_info(
                    self.encode_url(url),
                    size_policy=policy,
                    session=session,
                    **self.kwargs,
                    **kwargs,
                )
            )
            # stop as soon as a size is known; otherwise fall through to GET
            if info.get("size") is not None:
                break
        except Exception as exc:
            if policy == "get":
                # If get failed, then raise a FileNotFoundError
                raise FileNotFoundError(url) from exc
            # a failing HEAD is only logged; GET is tried next
            logger.debug(str(exc))

    # "size": None is the default that ``info`` may override
    return {"name": url, "size": None, **info, "type": "file"}
437
+
438
async def _glob(self, path, maxdepth=None, **kwargs):
    """
    Find files by glob-matching.

    This implementation is identical to the one in AbstractFileSystem,
    but "?" is not considered as a character for globbing, because it is
    so common in URLs, often identifying the "query" part.

    Parameters
    ----------
    path: str
        Pattern to match; only "*" and "[" are treated as magic here.
    maxdepth: int or None
        Depth limit applied when the pattern contains "**"; must be >= 1.
    kwargs:
        ``detail`` (default False) selects a dict of info instead of a
        list of paths; the rest is passed to ``_exists``/``_info``/``_find``.

    Raises
    ------
    ValueError
        If ``maxdepth`` is given and smaller than 1.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")
    import re

    ends_with_slash = path.endswith("/")  # _strip_protocol strips trailing slash
    path = self._strip_protocol(path)
    append_slash_to_dirname = ends_with_slash or path.endswith("/**")
    # position of the first magic character (or len(path) when absent)
    idx_star = path.find("*") if path.find("*") >= 0 else len(path)
    idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

    min_idx = min(idx_star, idx_brace)

    detail = kwargs.pop("detail", False)

    if not has_magic(path):
        # no pattern at all: answer with a plain existence check
        if await self._exists(path, **kwargs):
            if not detail:
                return [path]
            else:
                return {path: await self._info(path, **kwargs)}
        else:
            if not detail:
                return []  # glob of non-existent returns empty
            else:
                return {}
    elif "/" in path[:min_idx]:
        # search root is everything up to the last "/" before the magic
        min_idx = path[:min_idx].rindex("/")
        root = path[: min_idx + 1]
        depth = path[min_idx + 1 :].count("/") + 1
    else:
        root = ""
        depth = path[min_idx + 1 :].count("/") + 1

    if "**" in path:
        if maxdepth is not None:
            idx_double_stars = path.find("**")
            depth_double_stars = path[idx_double_stars:].count("/") + 1
            depth = depth - depth_double_stars + maxdepth
        else:
            depth = None

    allpaths = await self._find(
        root, maxdepth=depth, withdirs=True, detail=True, **kwargs
    )

    pattern = glob_translate(path + ("/" if ends_with_slash else ""))
    pattern = re.compile(pattern)

    out = {
        p: info
        for p, info in sorted(allpaths.items())
        if pattern.match(
            (
                p + "/"
                if append_slash_to_dirname and info["type"] == "directory"
                else p
            )
        )
    }

    if detail:
        return out
    else:
        return list(out)
510
+
511
async def _isdir(self, path):
    """Treat ``path`` as a directory when listing it yields any entries."""
    # override, since all URLs are (also) files
    try:
        listing = await self._ls(path)
    except (FileNotFoundError, ValueError):
        return False
    return bool(listing)
517
+
518
+
519
class HTTPFile(AbstractBufferedFile):
    """
    A file-like object pointing to a remote HTTP(S) resource

    Supports only reading, with read-ahead of a predetermined block-size.

    In the case that the server does not supply the filesize, only reading of
    the complete file in one go is supported.

    Parameters
    ----------
    url: str
        Full URL of the remote resource, including the protocol
    session: requests.Session or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
        configured for the FileSystem creating this file
    size: None or int
        If given, this is the size of the file in bytes, and we don't attempt
        to call the server to find the value.
    kwargs: all other key-values are passed to requests calls.
    """

    def __init__(
        self,
        fs,
        url,
        session=None,
        block_size=None,
        mode="rb",
        cache_type="bytes",
        cache_options=None,
        size=None,
        loop=None,
        asynchronous=False,
        **kwargs,
    ):
        if mode != "rb":
            raise NotImplementedError("File mode not supported")
        self.asynchronous = asynchronous
        self.url = url
        self.session = session
        # set before super().__init__ is called
        self.details = {"name": url, "size": size, "type": "file"}
        super().__init__(
            fs=fs,
            path=url,
            mode=mode,
            block_size=block_size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        self.loop = loop

    def read(self, length=-1):
        """Read bytes from file

        Parameters
        ----------
        length: int
            Read up to this many bytes. If negative, read all content to end of
            file. If the server has not supplied the filesize, attempting to
            read only part of the data will raise a ValueError.
        """
        if (
            (length < 0 and self.loc == 0)  # explicit read all
            # but not when the size is known and fits into a block anyways
            and not (self.size is not None and self.size <= self.blocksize)
        ):
            self._fetch_all()
        if self.size is None:
            if length < 0:
                self._fetch_all()
        else:
            # never ask for more than what remains in the file
            length = min(self.size - self.loc, length)
        return super().read(length)

    async def async_fetch_all(self):
        """Read whole file in one shot, without caching

        This is only called when position is still at zero,
        and read() is called without a byte-count.
        """
        logger.debug(f"Fetch all for {self}")
        # nothing to do when the whole file is already held in memory
        if not isinstance(self.cache, AllBytes):
            r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
            async with r:
                r.raise_for_status()
                out = await r.read()
                self.cache = AllBytes(
                    size=len(out), fetcher=None, blocksize=None, data=out
                )
                self.size = len(out)

    _fetch_all = sync_wrapper(async_fetch_all)

    def _parse_content_range(self, headers):
        """Parse the Content-Range header

        Returns a ``(start, end, total)`` tuple; each element is None when
        the header is absent, does not match, or uses "*" for that part.
        """
        s = headers.get("Content-Range", "")
        m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
        if not m:
            return None, None, None

        if m[1] == "*":
            start = end = None
        else:
            start, end = [int(x) for x in m[1].split("-")]
        total = None if m[2] == "*" else int(m[2])
        return start, end, total

    async def async_fetch_range(self, start, end):
        """Download a block of data

        The expectation is that the server returns only the requested bytes,
        with HTTP code 206. If this is not the case, we first check the headers,
        and then stream the output - if the data size is bigger than we
        requested, an exception is raised.
        """
        logger.debug(f"Fetch range for {self}: {start}-{end}")
        kwargs = self.kwargs.copy()
        headers = kwargs.pop("headers", {}).copy()
        # ``end`` is exclusive here, the HTTP Range header is inclusive
        headers["Range"] = f"bytes={start}-{end - 1}"
        logger.debug(f"{self.url} : {headers['Range']}")
        r = await self.session.get(
            self.fs.encode_url(self.url), headers=headers, **kwargs
        )
        async with r:
            if r.status == 416:
                # range request outside file
                return b""
            r.raise_for_status()

            # If the server has handled the range request, it should reply
            # with status 206 (partial content). But we'll guess that a suitable
            # Content-Range header or a Content-Length no more than the
            # requested range also mean we have got the desired range.
            response_is_range = (
                r.status == 206
                or self._parse_content_range(r.headers)[0] == start
                or int(r.headers.get("Content-Length", end + 1)) <= end - start
            )

            if response_is_range:
                # partial content, as expected
                out = await r.read()
            elif start > 0:
                raise ValueError(
                    "The HTTP server doesn't appear to support range requests. "
                    "Only reading this file from the beginning is supported. "
                    "Open with block_size=0 for a streaming file interface."
                )
            else:
                # Response is not a range, but we want the start of the file,
                # so we can read the required amount anyway.
                cl = 0
                out = []
                while True:
                    chunk = await r.content.read(2**20)
                    # data size unknown, let's read until we have enough
                    if chunk:
                        out.append(chunk)
                        cl += len(chunk)
                        if cl > end - start:
                            break
                    else:
                        break
                # trim any surplus from the last chunk
                out = b"".join(out)[: end - start]
            return out

    _fetch_range = sync_wrapper(async_fetch_range)

    def __reduce__(self):
        # pickle as "call reopen(...) with these arguments"
        return (
            reopen,
            (
                self.fs,
                self.url,
                self.mode,
                self.blocksize,
                self.cache.name if self.cache else "none",
                self.size,
            ),
        )
704
+
705
+
706
def reopen(fs, url, mode, blocksize, cache_type, size=None):
    """Re-create a file object by calling ``fs.open`` with the saved settings."""
    open_options = {
        "mode": mode,
        "block_size": blocksize,
        "cache_type": cache_type,
        "size": size,
    }
    return fs.open(url, **open_options)
710
+
711
+
712
# Only "*" and "[" count as glob characters here ("?" is excluded).
magic_check = re.compile("([*[])")


def has_magic(s):
    """Return True when ``s`` contains a "*" or "[" character."""
    return magic_check.search(s) is not None
718
+
719
+
720
class HTTPStreamFile(AbstractBufferedFile):
    """Forward-only, read-only view of an HTTP response body.

    The GET request is issued in the constructor and the body is consumed
    as ``read`` is called. Seeking is not supported, apart from no-op seeks
    to the current position.
    """

    def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
        self.asynchronous = kwargs.pop("asynchronous", False)
        self.url = url
        self.loop = loop
        self.session = session
        if mode != "rb":
            raise ValueError
        self.details = {"name": url, "size": None}
        super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)

        async def cor():
            r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
            try:
                self.fs._raise_not_found_for_status(r, url)
            except Exception:
                # do not leak the connection when the status check fails
                r.close()
                raise
            return r

        self.r = sync(self.loop, cor)

    def seek(self, loc, whence=0):
        """Accept only seeks that do not move; return the current position.

        Raises
        ------
        ValueError
            For any seek that would change the position.
        """
        if loc == 0 and whence == 1:
            return self.loc
        if loc == self.loc and whence == 0:
            return self.loc
        raise ValueError("Cannot seek streaming HTTP file")

    async def _read(self, num=-1):
        """Read up to ``num`` bytes from the response stream and advance ``loc``."""
        out = await self.r.content.read(num)
        self.loc += len(out)
        return out

    read = sync_wrapper(_read)

    async def _close(self):
        self.r.close()

    def close(self):
        # scheduled on the loop without waiting for the result
        asyncio.run_coroutine_threadsafe(self._close(), self.loop)
        super().close()

    def __reduce__(self):
        return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
761
+
762
+
763
class AsyncStreamFile(AbstractAsyncStreamedFile):
    """Async, forward-only reader of an HTTP response body.

    The GET request is issued lazily, on the first call to ``read``.
    """

    def __init__(
        self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
    ):
        # ``loop`` is accepted but not used in this class body
        self.url = url
        self.session = session
        self.r = None  # response object, created on first read
        if mode != "rb":
            raise ValueError
        self.details = {"name": url, "size": None}
        self.kwargs = kwargs
        super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
        # assigned after super().__init__, so it takes precedence over
        # whatever the base initialiser set
        self.size = size

    async def read(self, num=-1):
        """Read up to ``num`` bytes, opening the request on first use."""
        if self.r is None:
            r = await self.session.get(
                self.fs.encode_url(self.url), **self.kwargs
            ).__aenter__()
            self.fs._raise_not_found_for_status(r, self.url)
            # only remember the response once its status has been accepted
            self.r = r
        out = await self.r.content.read(num)
        self.loc += len(out)
        return out

    async def close(self):
        """Close the response, if one was opened, then the file itself."""
        if self.r is not None:
            self.r.close()
            self.r = None
        await super().close()
793
+
794
+
795
async def get_range(session, url, start, end, file=None, **kwargs):
    """Fetch bytes ``start`` (inclusive) to ``end`` (exclusive) of ``url``.

    Parameters
    ----------
    session:
        Object whose awaitable ``get`` returns an async-context-manager
        response.
    url: str
        Location to read from.
    start, end: int
        Byte range; sent as ``Range: bytes={start}-{end - 1}``.
    file: str or None
        If given, the bytes are written into this existing file at offset
        ``start`` and None is returned; otherwise the bytes are returned.
    kwargs:
        Passed to ``session.get``; ``headers`` is copied, not mutated.
    """
    # explicit get a range when we know it must be safe
    kwargs = kwargs.copy()
    headers = kwargs.pop("headers", {}).copy()
    headers["Range"] = f"bytes={start}-{end - 1}"
    r = await session.get(url, headers=headers, **kwargs)
    async with r:
        # checked inside the context so an error response is still released
        r.raise_for_status()
        out = await r.read()
    if file:
        with open(file, "r+b") as f:
            f.seek(start)
            f.write(out)
    else:
        return out
810
+
811
+
812
async def _file_info(url, session, size_policy="head", **kwargs):
    """Call HEAD on the server to get details about the file (size/checksum etc.)

    Default operation is to explicitly allow redirects and use encoding
    'identity' (no compression) to get the true size of the target.

    Parameters
    ----------
    url: str
        Location to query.
    session:
        Object providing awaitable ``head`` and ``get`` methods.
    size_policy: str
        "head" or "get": which HTTP method to use.
    kwargs:
        Passed to the request; ``allow_redirects`` defaults to True and
        ``headers`` is copied before ``Accept-Encoding`` is set.

    Returns
    -------
    dict that may contain "size" and any of "ETag", "Content-MD5", "Digest".

    Raises
    ------
    TypeError
        If ``size_policy`` is neither "head" nor "get".
    """
    logger.debug("Retrieve file size for %s", url)
    kwargs = kwargs.copy()
    ar = kwargs.pop("allow_redirects", True)
    head = kwargs.get("headers", {}).copy()
    head["Accept-Encoding"] = "identity"
    kwargs["headers"] = head

    info = {}
    if size_policy == "head":
        r = await session.head(url, allow_redirects=ar, **kwargs)
    elif size_policy == "get":
        r = await session.get(url, allow_redirects=ar, **kwargs)
    else:
        raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
    async with r:
        r.raise_for_status()

        # TODO:
        #  recognise lack of 'Accept-Ranges',
        #                 or 'Accept-Ranges': 'none' (not 'bytes')
        #  to mean streaming only, no random access => return None
        if "Content-Length" in r.headers:
            # Some servers may choose to ignore Accept-Encoding and return
            # compressed content, in which case the returned size is unreliable.
            if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
                "identity",
                "",
            ]:
                info["size"] = int(r.headers["Content-Length"])
        elif "Content-Range" in r.headers:
            # The total may legitimately be "*" (unknown) or the header may
            # be malformed; report no size then instead of raising.
            total = r.headers["Content-Range"].rpartition("/")[2].strip()
            if total.isdigit():
                info["size"] = int(total)

        for checksum_field in ["ETag", "Content-MD5", "Digest"]:
            if r.headers.get(checksum_field):
                info[checksum_field] = r.headers[checksum_field]

    return info
855
+
856
+
857
async def _file_size(url, session=None, *args, **kwargs):
    """Return the "size" reported by ``_file_info`` for ``url``, or None.

    A client session is created with ``get_client()`` when none is given.
    """
    if session is None:
        session = await get_client()
    # NOTE(review): ``session`` is passed by keyword while ``*args`` follow
    # ``url`` positionally, so a non-empty ``args`` looks like it would
    # collide with ``_file_info``'s ``session`` parameter - confirm intent.
    info = await _file_info(url, session=session, *args, **kwargs)
    return info.get("size")
862
+
863
+
864
+ file_size = sync_wrapper(_file_size)
lib/python3.11/site-packages/fsspec/implementations/jupyter.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import re
4
+
5
+ import requests
6
+
7
+ import fsspec
8
+
9
+
10
class JupyterFileSystem(fsspec.AbstractFileSystem):
    """View of the files as seen by a Jupyter server (notebook or lab)"""

    protocol = ("jupyter", "jlab")

    def __init__(self, url, tok=None, **kwargs):
        """

        Parameters
        ----------
        url : str
            Base URL of the server, like "http://127.0.0.1:8888". May include
            token in the string, which is given by the process when starting up
        tok : str
            If the token is obtained separately, can be given here
        kwargs

        Raises
        ------
        ValueError
            If ``url`` has a query string, no ``tok`` was given and no
            ``token=`` value can be found in the URL.
        """
        if "?" in url:
            if tok is None:
                try:
                    tok = re.findall("token=([a-z0-9]+)", url)[0]
                except IndexError as e:
                    raise ValueError("Could not determine token") from e
            # drop the query string; the token travels in a header instead
            url = url.split("?", 1)[0]
        self.url = url.rstrip("/") + "/api/contents"
        self.session = requests.Session()
        if tok:
            self.session.headers["Authorization"] = f"token {tok}"

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        """List ``path``.

        Returns a list of info dicts (``detail=True``) or of names.

        Raises
        ------
        FileNotFoundError
            If the server answers 404 for ``path``.
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # was ``return``: callers received an exception *instance*
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()

        if out["type"] == "directory":
            out = out["content"]
        else:
            out = [out]
        for o in out:
            o["name"] = o.pop("path")
            o.pop("content")
            # notebooks are presented as ordinary files
            if o["type"] == "notebook":
                o["type"] = "file"
        if detail:
            return out
        return [o["name"] for o in out]

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes of ``path``, sliced as ``[start:end]``.

        Raises
        ------
        FileNotFoundError
            If the server answers 404 for ``path``.
        """
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            # was ``return``: callers received an exception *instance*
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()
        if out["format"] == "text":
            # data should be binary
            b = out["content"].encode()
        else:
            b = base64.b64decode(out["content"])
        return b[start:end]

    def pipe_file(self, path, value, **_):
        """Write the bytes ``value`` to ``path`` as a base64-encoded file."""
        path = self._strip_protocol(path)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": len(value),
            "content": base64.b64encode(value).decode(),
            "format": "base64",
            "type": "file",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create directory ``path``, and its parents first if requested."""
        path = self._strip_protocol(path)
        if create_parents and "/" in path:
            self.mkdir(path.rsplit("/", 1)[0], True)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": None,
            "content": None,
            "type": "directory",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def _rm(self, path):
        """Send a DELETE request for ``path``."""
        path = self._strip_protocol(path)
        self.session.delete(f"{self.url}/{path}")

    def _open(self, path, mode="rb", **kwargs):
        """Open ``path``: "rb" gives an in-memory copy, anything else a writer."""
        path = self._strip_protocol(path)
        if mode == "rb":
            data = self.cat_file(path)
            return io.BytesIO(data)
        else:
            return SimpleFileWriter(self, path, mode="wb")
112
+
113
+
114
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
    """Buffered writer that sends its whole buffer in one ``pipe_file`` call."""

    def _upload_chunk(self, final=False):
        """Never uploads a chunk until file is done

        Not suitable for large files
        """
        if final is not False:
            # last call: rewind and ship everything that was buffered
            self.buffer.seek(0)
            payload = self.buffer.read()
            self.fs.pipe_file(self.path, payload)
            return None
        return False
lib/python3.11/site-packages/fsspec/implementations/libarchive.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import contextmanager
2
+ from ctypes import (
3
+ CFUNCTYPE,
4
+ POINTER,
5
+ c_int,
6
+ c_longlong,
7
+ c_void_p,
8
+ cast,
9
+ create_string_buffer,
10
+ )
11
+
12
+ import libarchive
13
+ import libarchive.ffi as ffi
14
+
15
+ from fsspec import open_files
16
+ from fsspec.archive import AbstractArchiveFileSystem
17
+ from fsspec.implementations.memory import MemoryFile
18
+ from fsspec.utils import DEFAULT_BLOCK_SIZE
19
+
20
+ # Libarchive requires seekable files or memory only for certain archive
21
+ # types. However, since we read the directory first to cache the contents
22
+ # and also allow random access to any file, the file-like object needs
23
+ # to be seekable no matter what.
24
+
25
# Seek call-backs (not provided in the libarchive python wrapper)
# C signature of the callback: returns c_longlong, takes
# (c_int, c_void_p, c_longlong, c_int).
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
# Binding for libarchive's "read_set_seek_callback", created through the
# wrapper's ``ffi.ffi`` helper with ``ffi.check_int`` as result check.
read_set_seek_callback = ffi.ffi(
    "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
)
# True when the wrapper exposes NO_OPEN_CB; selects the callback style below.
new_api = hasattr(ffi, "NO_OPEN_CB")
31
+
32
+
33
@contextmanager
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
    """Read an archive from a seekable file-like object.

    The `file` object must support the standard `readinto` and 'seek' methods.

    Yields a ``libarchive.read.ArchiveRead`` that pulls its data through
    ``file.readinto`` in pieces of at most ``block_size`` bytes.
    """
    # one buffer is reused for every read callback
    buf = create_string_buffer(block_size)
    buf_p = cast(buf, c_void_p)

    def read_func(archive_p, context, ptrptr):
        # readinto the buffer, returns number of bytes read
        length = file.readinto(buf)
        # write the address of the buffer into the pointer
        ptrptr = cast(ptrptr, POINTER(c_void_p))
        ptrptr[0] = buf_p
        # tell libarchive how much data was written into the buffer
        return length

    def seek_func(archive_p, context, offset, whence):
        file.seek(offset, whence)
        # tell libarchive the current position
        return file.tell()

    read_cb = ffi.READ_CALLBACK(read_func)
    seek_cb = SEEK_CALLBACK(seek_func)

    if new_api:
        open_cb = ffi.NO_OPEN_CB
        close_cb = ffi.NO_CLOSE_CB
    else:
        open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
        close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)

    with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
        # the seek callback is registered before the archive is opened
        read_set_seek_callback(archive_p, seek_cb)
        ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
        yield libarchive.read.ArchiveRead(archive_p)
70
+
71
+
72
class LibArchiveFileSystem(AbstractArchiveFileSystem):
    """Compressed archives as a file-system (read-only)

    Supports the following formats:
    tar, pax , cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar
    Microsoft CAB, 7-Zip, WARC

    See the libarchive documentation for further restrictions.
    https://www.libarchive.org/

    Keeps file object open while instance lives. It only works in seekable
    file-like objects. In case the filesystem does not support this kind of
    file object, it is recommended to cache locally.

    This class is pickleable, but not necessarily thread-safe (depends on the
    platform). See libarchive documentation for details.
    """

    root_marker = ""
    protocol = "libarchive"
    cachable = False

    def __init__(
        self,
        fo="",
        mode="r",
        target_protocol=None,
        target_options=None,
        block_size=DEFAULT_BLOCK_SIZE,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Contains the archive, and must exist. If a str, will fetch file
            using :meth:`~fsspec.open_files`, which must return one file exactly.
        mode: str
            Currently, only 'r' accepted
        target_protocol: str (optional)
            If ``fo`` is a string, this value can be used to override the
            FS protocol inferred from a URL
        target_options: dict (optional)
            Kwargs passed when instantiating the target FS, if ``fo`` is
            a string.
        block_size: int
            Buffer size used when reading the archive.

        Raises
        ------
        ValueError
            If ``mode`` is not "r", or ``fo`` does not resolve to exactly
            one file.
        """
        # ``super()`` already binds self; passing it again made the instance
        # one of its own positional storage arguments.
        super().__init__(**kwargs)
        if mode != "r":
            raise ValueError("Only read from archive files accepted")
        if isinstance(fo, str):
            files = open_files(fo, protocol=target_protocol, **(target_options or {}))
            if len(files) != 1:
                raise ValueError(
                    f'Path "{fo}" did not resolve to exactly one file: "{files}"'
                )
            fo = files[0]
        self.of = fo
        self.fo = fo.__enter__()  # the whole instance is a context
        self.block_size = block_size
        self.dir_cache = None

    @contextmanager
    def _open_archive(self):
        """Yield an archive reader positioned at the start of ``self.fo``."""
        self.fo.seek(0)
        with custom_reader(self.fo, block_size=self.block_size) as arc:
            yield arc

    @classmethod
    def _strip_protocol(cls, path):
        # file paths are always relative to the archive root
        return super()._strip_protocol(path).lstrip("/")

    def _get_dirs(self):
        """Populate ``self.dir_cache`` from the archive, once."""
        fields = {
            "name": "pathname",
            "size": "size",
            "created": "ctime",
            "mode": "mode",
            "uid": "uid",
            "gid": "gid",
            "mtime": "mtime",
        }

        if self.dir_cache is not None:
            return

        self.dir_cache = {}
        list_names = []
        with self._open_archive() as arc:
            for entry in arc:
                if not entry.isdir and not entry.isfile:
                    # Skip symbolic links, fifo entries, etc.
                    continue
                # The former per-entry ``_all_dirnames(set(entry.name))`` call
                # was dropped: ``set`` of a str is a set of characters, not a
                # set of paths. Parent directories are derived from the
                # complete name list after the loop.
                f = {key: getattr(entry, fields[key]) for key in fields}
                f["type"] = "directory" if entry.isdir else "file"
                list_names.append(entry.name)

                self.dir_cache[f["name"]] = f
        # libarchive does not seem to return an entry for the directories (at least
        # not in all formats), so get the directories names from the files names
        self.dir_cache.update(
            {
                dirname: {"name": dirname, "size": 0, "type": "directory"}
                for dirname in self._all_dirnames(list_names)
            }
        )

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return the archive member ``path`` as an in-memory file.

        Raises
        ------
        NotImplementedError
            If ``mode`` is not "rb".
        ValueError
            If a non-empty member yields no data.
        """
        path = self._strip_protocol(path)
        if mode != "rb":
            raise NotImplementedError

        data = bytes()
        with self._open_archive() as arc:
            for entry in arc:
                if entry.pathname != path:
                    continue

                if entry.size == 0:
                    # empty file, so there are no blocks
                    break

                # Concatenate every block instead of keeping only the first,
                # so a short first read cannot truncate the member.
                data = b"".join(entry.get_blocks(entry.size))
                if not data:
                    raise ValueError
        return MemoryFile(fs=self, path=path, data=data)
lib/python3.11/site-packages/fsspec/implementations/local.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import io
3
+ import logging
4
+ import os
5
+ import os.path as osp
6
+ import posixpath
7
+ import re
8
+ import shutil
9
+ import stat
10
+ import tempfile
11
+
12
+ from fsspec import AbstractFileSystem
13
+ from fsspec.compression import compr
14
+ from fsspec.core import get_compression
15
+ from fsspec.utils import isfilelike, stringify_path
16
+
17
+ logger = logging.getLogger("fsspec.local")
18
+
19
+
20
class LocalFileSystem(AbstractFileSystem):
    """Interface to files on local storage

    Parameters
    ----------
    auto_mkdir: bool
        Whether, when opening a file, the directory containing it should
        be created (if it doesn't already exist). This is assumed by pyarrow
        code.
    """

    root_marker = "/"
    protocol = "file", "local"
    local_file = True

    def __init__(self, auto_mkdir=False, **kwargs):
        super().__init__(**kwargs)
        self.auto_mkdir = auto_mkdir

    @property
    def fsid(self):
        """Constant identifier of this filesystem: ``"local"``."""
        return "local"

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create directory ``path``; raise FileExistsError if it already exists.

        With ``create_parents`` the missing parents are created as well;
        otherwise ``os.mkdir`` is called with ``kwargs`` forwarded.
        """
        path = self._strip_protocol(path)
        if self.exists(path):
            raise FileExistsError(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            os.mkdir(path, **kwargs)

    def makedirs(self, path, exist_ok=False):
        """Recursively create ``path`` via ``os.makedirs``."""
        path = self._strip_protocol(path)
        os.makedirs(path, exist_ok=exist_ok)

    def rmdir(self, path):
        """Remove the directory ``path`` via ``os.rmdir``."""
        path = self._strip_protocol(path)
        os.rmdir(path)

    def ls(self, path, detail=False, **kwargs):
        """List the content of directory ``path``.

        Returns a list of info dicts when ``detail`` is true, otherwise a list
        of child paths joined onto the normalised ``path``.
        """
        path = self._strip_protocol(path)
        if detail:
            with os.scandir(path) as it:
                return [self.info(f) for f in it]
        else:
            return [posixpath.join(path, f) for f in os.listdir(path)]

    def info(self, path, **kwargs):
        """Return a metadata dict for ``path``.

        ``path`` may be an ``os.DirEntry`` (as produced by ``os.scandir``) or a
        str/path-like. The result has keys ``name``, ``size``, ``type``
        (``"directory"``, ``"file"`` or ``"other"``), ``created``, ``islink``,
        ``mode``, ``uid``, ``gid``, ``mtime``, ``ino`` and ``nlink``; symbolic
        links additionally get ``destination``.
        """
        if isinstance(path, os.DirEntry):
            # scandir DirEntry
            out = path.stat(follow_symlinks=False)
            link = path.is_symlink()
            if path.is_dir(follow_symlinks=False):
                t = "directory"
            elif path.is_file(follow_symlinks=False):
                t = "file"
            else:
                t = "other"
            path = self._strip_protocol(path.path)
        else:
            # str or path-like
            path = self._strip_protocol(path)
            out = os.stat(path, follow_symlinks=False)
            link = stat.S_ISLNK(out.st_mode)
            if link:
                # classify by what the link points at
                out = os.stat(path, follow_symlinks=True)
            if stat.S_ISDIR(out.st_mode):
                t = "directory"
            elif stat.S_ISREG(out.st_mode):
                t = "file"
            else:
                t = "other"
        result = {
            "name": path,
            "size": out.st_size,
            "type": t,
            "created": out.st_ctime,
            "islink": link,
        }
        for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
            result[field] = getattr(out, f"st_{field}")
        if result["islink"]:
            result["destination"] = os.readlink(path)
            try:
                out2 = os.stat(path, follow_symlinks=True)
                result["size"] = out2.st_size
            except OSError:
                # target could not be stat'ed; report a zero size
                result["size"] = 0
        return result

    def lexists(self, path, **kwargs):
        """Return ``os.path.lexists`` for ``path``.

        The path is normalised first so that ``file://`` / ``local://`` URLs
        are accepted like in every other method of this class.
        """
        return osp.lexists(self._strip_protocol(path))

    def cp_file(self, path1, path2, **kwargs):
        """Copy file ``path1`` to ``path2``; for a directory, create ``path2``.

        Raises FileNotFoundError when ``path1`` is neither a file nor a
        directory.
        """
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        if self.auto_mkdir:
            self.makedirs(self._parent(path2), exist_ok=True)
        if self.isfile(path1):
            shutil.copyfile(path1, path2)
        elif self.isdir(path1):
            # call makedirs directly, as everywhere else in this class
            self.makedirs(path2, exist_ok=True)
        else:
            raise FileNotFoundError(path1)

    def get_file(self, path1, path2, callback=None, **kwargs):
        """Copy ``path1`` into ``path2`` (a path or an open file-like object)."""
        if isfilelike(path2):
            # normalise so URL-style source paths also work in this branch
            with open(self._strip_protocol(path1), "rb") as f:
                shutil.copyfileobj(f, path2)
        else:
            return self.cp_file(path1, path2, **kwargs)

    def put_file(self, path1, path2, callback=None, **kwargs):
        """Copy ``path1`` to ``path2``; identical to :meth:`cp_file` here."""
        return self.cp_file(path1, path2, **kwargs)

    def mv_file(self, path1, path2, **kwargs):
        """Move ``path1`` to ``path2`` using ``shutil.move``."""
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        shutil.move(path1, path2)

    def link(self, src, dst, **kwargs):
        """Create a hard link ``dst`` pointing at ``src`` (``os.link``)."""
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.link(src, dst, **kwargs)

    def symlink(self, src, dst, **kwargs):
        """Create a symbolic link ``dst`` pointing at ``src`` (``os.symlink``)."""
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.symlink(src, dst, **kwargs)

    def islink(self, path) -> bool:
        """Return True if ``path`` is a symbolic link."""
        return os.path.islink(self._strip_protocol(path))

    def rm_file(self, path):
        """Delete the single file ``path``."""
        os.remove(self._strip_protocol(path))

    def rm(self, path, recursive=False, maxdepth=None):
        """Delete one path or a list of paths.

        Directories require ``recursive=True`` and are removed with
        ``shutil.rmtree``; the current working directory is refused.
        ``maxdepth`` is accepted but not used.
        """
        if not isinstance(path, list):
            path = [path]

        for p in path:
            p = self._strip_protocol(p).rstrip("/")
            if self.isdir(p):
                if not recursive:
                    raise ValueError("Cannot delete directory, set recursive=True")
                if osp.abspath(p) == os.getcwd():
                    raise ValueError("Cannot delete current working directory")
                shutil.rmtree(p)
            else:
                os.remove(p)

    def unstrip_protocol(self, name):
        """Return ``name`` as a ``file://`` URL."""
        name = self._strip_protocol(name)  # normalise for local/win/...
        return f"file://{name}"

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """Open ``path`` as a :class:`LocalFileOpener`.

        When ``auto_mkdir`` is set and the mode contains ``"w"``, the parent
        directory is created first.
        """
        path = self._strip_protocol(path)
        if self.auto_mkdir and "w" in mode:
            self.makedirs(self._parent(path), exist_ok=True)
        return LocalFileOpener(path, mode, fs=self, **kwargs)

    def touch(self, path, truncate=True, **kwargs):
        """Create ``path`` if missing, else update its times; optionally empty it."""
        path = self._strip_protocol(path)
        if self.auto_mkdir:
            self.makedirs(self._parent(path), exist_ok=True)
        if self.exists(path):
            os.utime(path, None)
        else:
            open(path, "a").close()
        if truncate:
            os.truncate(path, 0)

    def created(self, path):
        """Return the ``created`` value of :meth:`info` as a UTC datetime."""
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(
            info["created"], tz=datetime.timezone.utc
        )

    def modified(self, path):
        """Return the ``mtime`` value of :meth:`info` as a UTC datetime."""
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)

    @classmethod
    def _parent(cls, path):
        """Return everything before the last ``/`` of ``path``, or the root marker."""
        path = cls._strip_protocol(path).rstrip("/")
        if "/" in path:
            return path.rsplit("/", 1)[0]
        else:
            return cls.root_marker

    @classmethod
    def _strip_protocol(cls, path):
        """Drop a ``file:``/``local:`` prefix and normalise to a posix-style path."""
        path = stringify_path(path)
        if path.startswith("file://"):
            path = path[7:]
        elif path.startswith("file:"):
            path = path[5:]
        elif path.startswith("local://"):
            path = path[8:]
        elif path.startswith("local:"):
            path = path[6:]
        return make_path_posix(path).rstrip("/") or cls.root_marker

    def _isfilestore(self):
        # Inheriting from DaskFileSystem makes this False (S3, etc. were)
        # the original motivation. But we are a posix-like file system.
        # See https://github.com/dask/dask/issues/5526
        return True

    def chmod(self, path, mode):
        """Change the permission bits of ``path`` (``os.chmod``).

        The path is normalised with ``_strip_protocol`` so URL-style paths are
        accepted, consistent with the other methods.
        """
        path = self._strip_protocol(path)
        return os.chmod(path, mode)
233
+
234
+
235
def make_path_posix(path, sep=os.sep):
    """Make path generic

    Converts ``path`` to an absolute, forward-slash separated string.

    Parameters
    ----------
    path: str | list | set | tuple
        A single path, or a collection of paths (converted element-wise and
        returned as the same collection type).
    sep: str
        Path separator of the platform the path was written for; defaults to
        ``os.sep``. It is now also honoured for the elements of a collection
        (previously they were always converted with the default separator).

    Returns
    -------
    str or the input collection type
    """
    if isinstance(path, (list, set, tuple)):
        return type(path)(make_path_posix(p, sep) for p in path)
    if "~" in path:
        path = osp.expanduser(path)
    if sep == "/":
        # most common fast case for posix
        if path.startswith("/"):
            return path
        if path.startswith("./"):
            path = path[2:]
        return f"{os.getcwd()}/{path}"
    # From here on ``sep`` is never "/" (handled above).
    if (sep not in path and "/" not in path) or (
        sep == "\\" and ":" not in path and not path.startswith("\\\\")
    ):
        # relative path like "path" or "rel\\path" (win) or rel/path"
        if os.sep == "\\":
            # abspath made some more '\\' separators
            return make_path_posix(osp.abspath(path))
        else:
            return f"{os.getcwd()}/{path}"
    if path.startswith("file://"):
        path = path[7:]
    if re.match("/[A-Za-z]:", path):
        # for windows file URI like "file:///C:/folder/file"
        # or "file:///C:\\dir\\file"
        path = path[1:].replace("\\", "/").replace("//", "/")
    if path.startswith("\\\\"):
        # special case for windows UNC/DFS-style paths, do nothing,
        # just flip the slashes around (case below does not work!)
        return path.replace("\\", "/")
    if re.match("[A-Za-z]:", path):
        # windows full path like "C:\\local\\path"
        return path.lstrip("\\").replace("\\", "/").replace("//", "/")
    if path.startswith("\\"):
        # windows network path like "\\server\\path"
        return "/" + path.lstrip("\\").replace("\\", "/").replace("//", "/")
    return path
276
+
277
+
278
def trailing_sep(path):
    """Tell whether ``path`` finishes with a path separator.

    Both ``os.sep`` and, where the platform defines one, ``os.altsep`` count
    as separators, so a forward slash is recognised even on Operating Systems
    that normally use a backslash.
    """
    # TODO: if all incoming paths were posix-compliant then separator would
    # always be a forward slash, simplifying this function.
    # See https://github.com/fsspec/filesystem_spec/pull/1250
    separators = (os.sep,) if os.altsep is None else (os.sep, os.altsep)
    return path.endswith(separators)
288
+
289
+
290
class LocalFileOpener(io.IOBase):
    """File-like object wrapping a file opened on the local disk.

    The real file object lives in ``self.f``; most methods delegate to it.
    When ``autocommit`` is false and the mode contains ``"w"``, data is
    written to a temporary file which :meth:`commit` moves to ``path`` and
    :meth:`discard` deletes.
    """

    def __init__(
        self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
    ):
        logger.debug("open file: %s", path)
        # ``f`` is assigned before anything else so that attribute lookups
        # falling through to ``__getattr__`` always find it.
        self.f = None
        self.path = path
        self.mode = mode
        self.fs = fs
        self.autocommit = autocommit
        self.compression = get_compression(path, compression)
        self.blocksize = io.DEFAULT_BUFFER_SIZE
        self._open()

    def _open(self):
        """(Re)open the underlying file unless it is already open."""
        if self.f is not None and not self.f.closed:
            return
        deferred_write = "w" in self.mode and not self.autocommit
        if deferred_write:
            # TODO: check if path is writable?
            descriptor, temp_name = tempfile.mkstemp()
            os.close(descriptor)  # we want normal open and normal buffered file
            self.temp = temp_name
            self.f = open(temp_name, mode=self.mode)
        else:
            self.f = open(self.path, mode=self.mode)
            if self.compression:
                wrapper = compr[self.compression]
                self.f = wrapper(self.f, mode=self.mode)
        if "w" not in self.mode:
            # seek to the end to learn the size, then rewind
            self.size = self.f.seek(0, 2)
            self.f.seek(0)
            self.f.size = self.size

    def _fetch_range(self, start, end):
        """Return the bytes in ``[start, end)``; only valid in read modes."""
        # probably only used by cached FS
        if "r" not in self.mode:
            raise ValueError
        self._open()
        self.f.seek(start)
        return self.f.read(end - start)

    def __setstate__(self, state):
        """Restore from pickled state, reopening read-mode files at their offset."""
        self.f = None
        position = state.pop("loc", None)
        self.__dict__.update(state)
        if "r" in state["mode"]:
            self.f = None
            self._open()
            self.f.seek(position)

    def __getstate__(self):
        """Return picklable state: instance attributes minus the open file.

        Read-mode files record their current offset under ``"loc"``; a
        write-mode file that is still open cannot be serialised.
        """
        state = self.__dict__.copy()
        state.pop("f")
        if "r" in self.mode:
            state["loc"] = self.f.tell()
        elif not self.f.closed:
            raise ValueError("Cannot serialise open write-mode local file")
        return state

    def commit(self):
        """Move the temporary file onto ``path`` (non-autocommit only)."""
        if self.autocommit:
            raise RuntimeError("Can only commit if not already set to autocommit")
        shutil.move(self.temp, self.path)

    def discard(self):
        """Delete the temporary file (non-autocommit only)."""
        if self.autocommit:
            raise RuntimeError("Cannot discard if set to autocommit")
        os.remove(self.temp)

    def readable(self) -> bool:
        """Always reports True."""
        return True

    def writable(self) -> bool:
        """Report True unless the mode contains ``"r"``."""
        return "r" not in self.mode

    # --- plain delegation to the wrapped file object ---------------------

    def read(self, *args, **kwargs):
        return self.f.read(*args, **kwargs)

    def write(self, *args, **kwargs):
        return self.f.write(*args, **kwargs)

    def tell(self, *args, **kwargs):
        return self.f.tell(*args, **kwargs)

    def seek(self, *args, **kwargs):
        return self.f.seek(*args, **kwargs)

    def seekable(self, *args, **kwargs):
        return self.f.seekable(*args, **kwargs)

    def readline(self, *args, **kwargs):
        return self.f.readline(*args, **kwargs)

    def readlines(self, *args, **kwargs):
        return self.f.readlines(*args, **kwargs)

    def close(self):
        return self.f.close()

    def truncate(self, size=None) -> int:
        return self.f.truncate(size)

    @property
    def closed(self):
        return self.f.closed

    def fileno(self):
        # ``raw`` is not set on this object, so the lookup falls through
        # ``__getattr__`` to the wrapped file.
        return self.raw.fileno()

    def flush(self) -> None:
        self.f.flush()

    def __iter__(self):
        return self.f.__iter__()

    def __getattr__(self, item):
        # anything not defined here is looked up on the wrapped file
        return getattr(self.f, item)

    def __enter__(self):
        self._incontext = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._incontext = False
        self.f.__exit__(exc_type, exc_value, traceback)
lib/python3.11/site-packages/fsspec/implementations/memory.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from datetime import datetime, timezone
5
+ from errno import ENOTEMPTY
6
+ from io import BytesIO
7
+ from typing import Any, ClassVar
8
+
9
+ from fsspec import AbstractFileSystem
10
+
11
# Obtain the logger from the logging registry so that it takes part in the
# logger hierarchy and picks up handlers/levels configured by the application;
# instantiating ``logging.Logger`` directly creates a detached logger.
logger = logging.getLogger("fsspec.memoryfs")
12
+
13
+
14
class MemoryFileSystem(AbstractFileSystem):
    """A filesystem based on a dict of BytesIO objects

    This is a global filesystem so instances of this class all point to the same
    in memory filesystem.
    """

    store: ClassVar[dict[str, Any]] = {}  # global, do not overwrite!
    pseudo_dirs = [""]  # global, do not overwrite!
    protocol = "memory"
    root_marker = "/"

    @classmethod
    def _strip_protocol(cls, path):
        """Normalise ``path`` to the key form used by ``store``/``pseudo_dirs``.

        A leading ``memory://`` is dropped. Paths that still contain ``::`` or
        ``://`` only lose trailing slashes; all others get exactly one leading
        slash and no trailing slash, the root becoming ``""``.
        """
        prefix = "memory://"
        if path.startswith(prefix):
            path = path[len(prefix) :]
        if "::" in path or "://" in path:
            return path.rstrip("/")
        trimmed = path.strip("/")
        if not trimmed:
            return ""
        return "/" + trimmed

    def ls(self, path, detail=True, **kwargs):
        """List ``path``.

        An exact file key yields a single entry. Otherwise the direct children
        found among the stored files and the pseudo-directories are returned:
        info dicts when ``detail`` is true, else a sorted list of names.
        Raises FileNotFoundError when nothing matches and ``path`` is not a
        known (empty) directory.
        """
        path = self._strip_protocol(path)
        if path in self.store:
            # there is a key with this exact name
            if not detail:
                return [path]
            return [
                {
                    "name": path,
                    "size": self.store[path].size,
                    "type": "file",
                    "created": self.store[path].created.timestamp(),
                }
            ]

        prefix = path + "/"
        seen_dirs = set()
        listing = []

        for key in tuple(self.store):
            if not key.startswith(prefix):
                continue
            remainder = key[len(prefix) :]
            if "/" not in remainder:
                # exact child
                listing.append(
                    {
                        "name": key,
                        "size": self.store[key].size,
                        "type": "file",
                        "created": self.store[key].created.timestamp(),
                    }
                )
            else:
                # implied child directory
                child = prefix + remainder.split("/", 1)[0]
                if child not in seen_dirs:
                    listing.append({"name": child, "size": 0, "type": "directory"})
                    seen_dirs.add(child)

        for pdir in self.pseudo_dirs:
            if not pdir.startswith(prefix):
                continue
            remainder = pdir[len(prefix) :]
            if "/" in remainder:
                # directory implied by deeper pdir
                child = prefix + remainder.split("/", 1)[0]
            else:
                # exact child pdir
                child = pdir
            if child not in seen_dirs:
                listing.append({"name": child, "size": 0, "type": "directory"})
                seen_dirs.add(child)

        if not listing:
            if path in self.pseudo_dirs:
                # empty dir
                return []
            raise FileNotFoundError(path)
        if detail:
            return listing
        return sorted([entry["name"] for entry in listing])

    def mkdir(self, path, create_parents=True, **kwargs):
        """Register ``path`` as a pseudo-directory.

        Raises FileExistsError if the path is already a file or directory and
        NotADirectoryError if its parent is a file. With ``create_parents``
        the parent chain is registered first.
        """
        path = self._strip_protocol(path)
        if path in self.store or path in self.pseudo_dirs:
            raise FileExistsError(path)
        parent = self._parent(path)
        parent_is_not_root = bool(parent.strip("/"))
        if parent_is_not_root and self.isfile(parent):
            raise NotADirectoryError(parent)
        if create_parents and parent_is_not_root:
            try:
                self.mkdir(parent, create_parents, **kwargs)
            except FileExistsError:
                pass
        if path and path not in self.pseudo_dirs:
            self.pseudo_dirs.append(path)

    def makedirs(self, path, exist_ok=False):
        """Create ``path`` with parents; tolerate existence when ``exist_ok``."""
        try:
            self.mkdir(path, create_parents=True)
        except FileExistsError:
            if exist_ok:
                return
            raise

    def pipe_file(self, path, value, **kwargs):
        """Set the bytes of given file

        Avoids copies of the data if possible
        """
        self.open(path, "wb", data=value)

    def rmdir(self, path):
        """Remove the empty pseudo-directory ``path``.

        The root is silently left alone. Raises FileNotFoundError for an
        unknown directory and ``OSError(ENOTEMPTY)`` for a non-empty one.
        """
        path = self._strip_protocol(path)
        if path == "":
            # silently avoid deleting FS root
            return
        if path not in self.pseudo_dirs:
            raise FileNotFoundError(path)
        if self.ls(path):
            raise OSError(ENOTEMPTY, "Directory not empty", path)
        self.pseudo_dirs.remove(path)

    def info(self, path, **kwargs):
        """Return the info dict of ``path``; directories take precedence over files."""
        path = self._strip_protocol(path)
        is_directory = path in self.pseudo_dirs or any(
            p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
        )
        if is_directory:
            return {
                "name": path,
                "size": 0,
                "type": "directory",
            }
        if path in self.store:
            filelike = self.store[path]
            return {
                "name": path,
                "size": filelike.size,
                "type": "file",
                "created": getattr(filelike, "created", None),
            }
        raise FileNotFoundError(path)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return the file object for ``path``.

        ``"rb"``, ``"ab"`` and ``"r+b"`` hand back the stored object itself,
        positioned at the end for ``"ab"`` and at the start otherwise.
        ``"wb"`` creates a new :class:`MemoryFile` (initialised from the
        ``data`` keyword if given) and commits it unless a transaction is
        active. Any other mode raises ValueError.
        """
        path = self._strip_protocol(path)
        if path in self.pseudo_dirs:
            raise IsADirectoryError(path)
        # no ancestor of the target may be a file
        ancestor = path
        while len(ancestor) > 1:
            ancestor = self._parent(ancestor)
            if self.isfile(ancestor):
                raise FileExistsError(ancestor)
        if mode in ("rb", "ab", "r+b"):
            if path not in self.store:
                raise FileNotFoundError(path)
            existing = self.store[path]
            # "ab" positions at the end of file, the others at the beginning
            existing.seek(0, 2 if mode == "ab" else 0)
            return existing
        if mode == "wb":
            fresh = MemoryFile(self, path, kwargs.get("data"))
            if not self._intrans:
                fresh.commit()
            return fresh
        name = self.__class__.__name__
        raise ValueError(f"unsupported file mode for {name}: {mode!r}")

    def cp_file(self, path1, path2, **kwargs):
        """Copy file ``path1`` to ``path2``; for a directory, register ``path2``."""
        path1 = self._strip_protocol(path1)
        path2 = self._strip_protocol(path2)
        if self.isfile(path1):
            content = self.store[path1].getvalue()  # implicit copy
            self.store[path2] = MemoryFile(self, path2, content)
        elif self.isdir(path1):
            if path2 not in self.pseudo_dirs:
                self.pseudo_dirs.append(path2)
        else:
            raise FileNotFoundError(path1)

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Return the bytes ``[start:end]`` of the stored file ``path``."""
        path = self._strip_protocol(path)
        try:
            return bytes(self.store[path].getbuffer()[start:end])
        except KeyError:
            raise FileNotFoundError(path)

    def _rm(self, path):
        """Delete the stored file ``path``."""
        path = self._strip_protocol(path)
        try:
            del self.store[path]
        except KeyError as e:
            raise FileNotFoundError(path) from e

    def modified(self, path):
        """Return the ``modified`` attribute of the stored file."""
        path = self._strip_protocol(path)
        try:
            return self.store[path].modified
        except KeyError:
            raise FileNotFoundError(path)

    def created(self, path):
        """Return the ``created`` attribute of the stored file."""
        path = self._strip_protocol(path)
        try:
            return self.store[path].created
        except KeyError:
            raise FileNotFoundError(path)

    def rm(self, path, recursive=False, maxdepth=None):
        """Delete the given path(s), expanded with ``expand_path``."""
        if isinstance(path, str):
            path = self._strip_protocol(path)
        else:
            path = [self._strip_protocol(p) for p in path]
        expanded = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
        for target in reversed(expanded):
            # If the expanded path doesn't exist, it is only because the expanded
            # path was a directory that does not exist in self.pseudo_dirs. This
            # is possible if you directly create files without making the
            # directories first.
            if not self.exists(target):
                continue
            if self.isfile(target):
                self.rm_file(target)
            else:
                self.rmdir(target)
257
+
258
+
259
class MemoryFile(BytesIO):
    """A BytesIO which can't close and works as a context manager

    Can initialise with data. Each path should only be active once at any moment.

    No need to provide fs, path if auto-committing (default)
    """

    def __init__(self, fs=None, path=None, data=None):
        logger.debug("open file %s", path)
        self.fs = fs
        self.path = path
        self.created = datetime.now(tz=timezone.utc)
        self.modified = datetime.now(tz=timezone.utc)
        if not data:
            return
        # load the initial content and rewind to the start
        super().__init__(data)
        self.seek(0)

    @property
    def size(self):
        """Number of bytes currently held in the buffer."""
        view = self.getbuffer()
        return view.nbytes

    def __enter__(self):
        return self

    def close(self):
        """Do nothing, so the content stays available."""

    def discard(self):
        """Do nothing."""

    def commit(self):
        """Store this object in ``fs.store`` under ``path`` and stamp ``modified``."""
        self.fs.store[self.path] = self
        self.modified = datetime.now(tz=timezone.utc)
lib/python3.11/site-packages/fsspec/implementations/reference.py ADDED
@@ -0,0 +1,1122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import collections
3
+ import io
4
+ import itertools
5
+ import logging
6
+ import math
7
+ import os
8
+ from functools import lru_cache
9
+ from typing import TYPE_CHECKING
10
+
11
+ import fsspec.core
12
+
13
+ try:
14
+ import ujson as json
15
+ except ImportError:
16
+ if not TYPE_CHECKING:
17
+ import json
18
+
19
+ from ..asyn import AsyncFileSystem
20
+ from ..callbacks import _DEFAULT_CALLBACK
21
+ from ..core import filesystem, open, split_protocol
22
+ from ..utils import isfilelike, merge_offset_ranges, other_paths
23
+
24
+ logger = logging.getLogger("fsspec.reference")
25
+
26
+
27
class ReferenceNotReachable(RuntimeError):
    """Error carrying a reference key and the target that could not be fetched.

    ``reference`` and ``target`` are kept as attributes; any further
    positional arguments are forwarded to ``RuntimeError``.
    """

    def __init__(self, reference, target, *args):
        RuntimeError.__init__(self, *args)
        self.reference, self.target = reference, target

    def __str__(self):
        reference, target = self.reference, self.target
        return f'Reference "{reference}" failed to fetch target {target}'
35
+
36
+
37
+ def _first(d):
38
+ return list(d.values())[0]
39
+
40
+
41
+ def _prot_in_references(path, references):
42
+ ref = references.get(path)
43
+ if isinstance(ref, (list, tuple)):
44
+ return split_protocol(ref[0])[0] if ref[0] else ref[0]
45
+
46
+
47
def _protocol_groups(paths, references):
    """Group ``paths`` by the protocol of the reference each one points to.

    A single string yields a one-entry dict. Otherwise the result maps each
    protocol (as given by ``_prot_in_references``) to the list of paths
    using it, in input order.
    """
    if isinstance(paths, str):
        return {_prot_in_references(paths, references): [paths]}
    groups = {}
    for path in paths:
        key = _prot_in_references(path, references)
        if key not in groups:
            groups[key] = []
        groups[key].append(path)
    return groups
55
+
56
+
57
class RefsValuesView(collections.abc.ValuesView):
    """Values view over the wrapped reference mapping.

    Iteration yields, in order: every ``zmetadata`` value JSON-encoded to
    bytes, every value of ``_items``, and then for each field returned by
    ``listdir()`` either the single ``"<field>/0"`` item (when the field has
    no chunk sizes) or all records from ``_generate_all_records``.
    """

    def __iter__(self):
        mapping = self._mapping
        for meta in mapping.zmetadata.values():
            yield json.dumps(meta).encode()
        yield from mapping._items.values()
        for field in mapping.listdir():
            if len(mapping._get_chunk_sizes(field)) == 0:
                yield mapping[field + "/0"]
            else:
                yield from mapping._generate_all_records(field)
68
+
69
+
70
class RefsItemsView(collections.abc.ItemsView):
    """Items view that pairs the mapping's keys with its values in order."""

    def __iter__(self):
        mapping = self._mapping
        keys = mapping.keys()
        values = mapping.values()
        return zip(keys, values)
73
+
74
+
75
def ravel_multi_index(idx, sizes):
    """Flatten the multi-dimensional index ``idx`` for dimensions ``sizes``.

    The last axis varies fastest: walking both sequences from the end, each
    index is weighted by the product of the sizes of the axes after it.
    """
    flat = 0
    stride = 1
    for position, extent in zip(idx[::-1], sizes[::-1]):
        flat = flat + position * stride
        stride = stride * extent
    return flat
82
+
83
+
84
+ class LazyReferenceMapper(collections.abc.MutableMapping):
85
+ """This interface can be used to read/write references from Parquet stores.
86
+ It is not intended for other types of references.
87
+ It can be used with Kerchunk's MultiZarrToZarr method to combine
88
+ references into a parquet store.
89
+ Examples of this use-case can be found here:
90
+ https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
91
+
92
+ # import is class level to prevent numpy dep requirement for fsspec
93
+ @property
94
+ def np(self):
95
+ import numpy as np
96
+
97
+ return np
98
+
99
+ @property
100
+ def pd(self):
101
+ import pandas as pd
102
+
103
+ return pd
104
+
105
+ def __init__(
106
+ self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10
107
+ ):
108
+ """
109
+ Parameters
110
+ ----------
111
+ root : str
112
+ Root of parquet store
113
+ fs : fsspec.AbstractFileSystem
114
+ fsspec filesystem object, default is local filesystem.
115
+ cache_size : int, default=128
116
+ Maximum size of LRU cache, where cache_size*record_size denotes
117
+ the total number of references that can be loaded in memory at once.
118
+ categorical_threshold : int
119
+ Encode urls as pandas.Categorical to reduce memory footprint if the ratio
120
+ of the number of unique urls to total number of refs for each variable
121
+ is greater than or equal to this number. (default 10)
122
+
123
+
124
+ """
125
+ self.root = root
126
+ self.chunk_sizes = {}
127
+ self._items = {}
128
+ self.dirs = None
129
+ self.fs = fsspec.filesystem("file") if fs is None else fs
130
+ self._items[".zmetadata"] = self.fs.cat_file(
131
+ "/".join([self.root, ".zmetadata"])
132
+ )
133
+ met = json.loads(self._items[".zmetadata"])
134
+ self.record_size = met["record_size"]
135
+ self.zmetadata = met["metadata"]
136
+ self.url = self.root + "/{field}/refs.{record}.parq"
137
+ self.out_root = out_root or self.root
138
+ self.cat_thresh = categorical_threshold
139
+
140
+ # Define function to open and decompress refs
141
+ @lru_cache(maxsize=cache_size)
142
+ def open_refs(field, record):
143
+ """cached parquet file loader"""
144
+ path = self.url.format(field=field, record=record)
145
+ data = io.BytesIO(self.fs.cat_file(path))
146
+ df = self.pd.read_parquet(data, engine="fastparquet")
147
+ refs = {c: df[c].values for c in df.columns}
148
+ return refs
149
+
150
+ self.open_refs = open_refs
151
+
152
+ @staticmethod
153
+ def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
154
+ """Make empty parquet reference set
155
+
156
+ Parameters
157
+ ----------
158
+ root: str
159
+ Directory to contain the output; will be created
160
+ storage_options: dict | None
161
+ For making the filesystem to use for writing is fs is None
162
+ fs: FileSystem | None
163
+ Filesystem for writing
164
+ record_size: int
165
+ Number of references per parquet file
166
+ kwargs: passed to __init__
167
+
168
+ Returns
169
+ -------
170
+ LazyReferenceMapper instance
171
+ """
172
+ met = {"metadata": {}, "record_size": record_size}
173
+ if fs is None:
174
+ fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
175
+ fs.makedirs(root, exist_ok=True)
176
+ fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
177
+ return LazyReferenceMapper(root, fs, **kwargs)
178
+
179
+ def listdir(self, basename=True):
180
+ """List top-level directories"""
181
+ if self.dirs is None:
182
+ dirs = [p.split("/", 1)[0] for p in self.zmetadata]
183
+ self.dirs = {p for p in dirs if p and not p.startswith(".")}
184
+ listing = self.dirs
185
+ if basename:
186
+ listing = [os.path.basename(path) for path in listing]
187
+ return listing
188
+
189
def ls(self, path="", detail=True):
    """Shortcut file listings

    Parameters
    ----------
    path: str
        "" lists the top level; a single field name lists that field.
        Deeper paths are not supported.
    detail: bool
        If False, return names only; otherwise return a list of info dicts
        with "name", "type" and "size".

    Raises
    ------
    FileNotFoundError
        If ``path`` contains a "/" (listing inside directories is not
        implemented).
    """
    # ``self._items`` also holds pending chunk writes under tuple keys
    # ``(field, record)``; only string keys are file names, so the tuples
    # must be ignored here (they would break sorting and ``startswith``).
    pending = [name for name in self._items if isinstance(name, str)]

    def _file_info(name):
        # size is the length of the JSON-encoded metadata, or of the stored
        # value when the key only exists in ``self._items``
        if name in self.zmetadata:
            payload = json.dumps(self.zmetadata[name])
        else:
            payload = self._items[name]
        return {"name": name, "type": "file", "size": len(payload)}

    if not path:
        dirnames = self.listdir()
        others = set(
            [".zmetadata"]
            + [name for name in self.zmetadata if "/" not in name]
            + [name for name in pending if "/" not in name]
        )
        if detail is False:
            others.update(dirnames)
            return sorted(others)
        dirinfo = [
            {"name": name, "type": "directory", "size": 0} for name in dirnames
        ]
        fileinfo = [_file_info(name) for name in others]
        return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
    parts = path.split("/", 1)
    if len(parts) > 1:
        raise FileNotFoundError("Cannot list within directories right now")
    field = parts[0]
    prefix = f"{path}/"
    others = set(
        [name for name in self.zmetadata if name.startswith(prefix)]
        + [name for name in pending if name.startswith(prefix)]
    )
    keys = self._keys_in_field(field)

    if detail is False:
        return list(others) + list(keys)
    fileinfo = [_file_info(name) for name in others]
    recs = self._generate_all_records(field)
    recinfo = [
        {"name": name, "type": "file", "size": rec[-1]}
        for name, rec in zip(keys, recs)
        if rec[0]  # filters out path==None, deleted/missing
    ]
    return fileinfo + recinfo
248
+
249
+ def _load_one_key(self, key):
250
+ """Get the reference for one key
251
+
252
+ Returns bytes, one-element list or three-element list.
253
+ """
254
+ if key in self._items:
255
+ return self._items[key]
256
+ elif key in self.zmetadata:
257
+ return json.dumps(self.zmetadata[key]).encode()
258
+ elif "/" not in key or self._is_meta(key):
259
+ raise KeyError(key)
260
+ field, sub_key = key.split("/")
261
+ record, _, _ = self._key_to_record(key)
262
+ maybe = self._items.get((field, key), {}).get(sub_key, False)
263
+ if maybe is None:
264
+ # explicitly deleted
265
+ raise KeyError
266
+ elif maybe:
267
+ return maybe
268
+
269
+ # Chunk keys can be loaded from row group and cached in LRU cache
270
+ try:
271
+ record, ri, chunk_size = self._key_to_record(key)
272
+ if chunk_size == 0:
273
+ return b""
274
+ refs = self.open_refs(field, record)
275
+ except (ValueError, TypeError, FileNotFoundError):
276
+ raise KeyError(key)
277
+ columns = ["path", "offset", "size", "raw"]
278
+ selection = [refs[c][ri] if c in refs else None for c in columns]
279
+ raw = selection[-1]
280
+ if raw is not None:
281
+ return raw
282
+ if selection[0] is None:
283
+ raise KeyError("This reference has been deleted")
284
+ if selection[1:3] == [0, 0]:
285
+ # URL only
286
+ return selection[:1]
287
+ # URL, offset, size
288
+ return selection[:3]
289
+
290
+ @lru_cache(4096)
291
+ def _key_to_record(self, key):
292
+ """Details needed to construct a reference for one key"""
293
+ field, chunk = key.split("/")
294
+ chunk_sizes = self._get_chunk_sizes(field)
295
+ if len(chunk_sizes) == 0:
296
+ return 0, 0, 0
297
+ chunk_idx = [int(c) for c in chunk.split(".")]
298
+ chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
299
+ record = chunk_number // self.record_size
300
+ ri = chunk_number % self.record_size
301
+ return record, ri, len(chunk_sizes)
302
+
303
+ def _get_chunk_sizes(self, field):
304
+ """The number of chunks along each axis for a given field"""
305
+ if field not in self.chunk_sizes:
306
+ zarray = self.zmetadata[f"{field}/.zarray"]
307
+ size_ratio = [
308
+ math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
309
+ ]
310
+ self.chunk_sizes[field] = size_ratio
311
+ return self.chunk_sizes[field]
312
+
313
+ def _generate_record(self, field, record):
314
+ """The references for a given parquet file of a given field"""
315
+ refs = self.open_refs(field, record)
316
+ it = iter(zip(*refs.values()))
317
+ if len(refs) == 3:
318
+ # All urls
319
+ return (list(t) for t in it)
320
+ elif len(refs) == 1:
321
+ # All raws
322
+ return refs["raw"]
323
+ else:
324
+ # Mix of urls and raws
325
+ return (list(t[:3]) if not t[3] else t[3] for t in it)
326
+
327
+ def _generate_all_records(self, field):
328
+ """Load all the references within a field by iterating over the parquet files"""
329
+ nrec = 1
330
+ for ch in self._get_chunk_sizes(field):
331
+ nrec *= ch
332
+ nrec = math.ceil(nrec / self.record_size)
333
+ for record in range(nrec):
334
+ yield from self._generate_record(field, record)
335
+
336
def values(self):
    """Return a ``RefsValuesView`` over this mapper."""
    view = RefsValuesView(self)
    return view
338
+
339
def items(self):
    """Return a ``RefsItemsView`` over this mapper."""
    view = RefsItemsView(self)
    return view
341
+
342
+ def __hash__(self):
343
+ return id(self)
344
+
345
+ @lru_cache(20)
346
+ def __getitem__(self, key):
347
+ return self._load_one_key(key)
348
+
349
+ def __setitem__(self, key, value):
350
+ if "/" in key and not self._is_meta(key):
351
+ field, chunk = key.split("/")
352
+ record, i, _ = self._key_to_record(key)
353
+ subdict = self._items.setdefault((field, record), {})
354
+ subdict[i] = value
355
+ if len(subdict) == self.record_size:
356
+ self.write(field, record)
357
+ else:
358
+ # metadata or top-level
359
+ self._items[key] = value
360
+ self.zmetadata[key] = json.loads(
361
+ value.decode() if isinstance(value, bytes) else value
362
+ )
363
+
364
+ @staticmethod
365
+ def _is_meta(key):
366
+ return key.startswith(".z") or "/.z" in key
367
+
368
+ def __delitem__(self, key):
369
+ if key in self._items:
370
+ del self._items[key]
371
+ elif key in self.zmetadata:
372
+ del self.zmetadata[key]
373
+ else:
374
+ if "/" in key and not self._is_meta(key):
375
+ field, chunk = key.split("/")
376
+ record, _, _ = self._key_to_record(key)
377
+ subdict = self._items.setdefault((field, record), {})
378
+ subdict[chunk] = None
379
+ if len(subdict) == self.record_size:
380
+ self.write(field, record)
381
+ else:
382
+ # metadata or top-level
383
+ self._items[key] = None
384
+
385
def write(self, field, record, base_url=None, storage_options=None):
    """Write the buffered references of one record to a parquet file.

    Parameters
    ----------
    field: str
        Field (directory) the record belongs to
    record: int
        Record number; the file is named ``refs.{record}.parq``
    base_url: str | None
        Output location; ``self.out_root`` is used when not given
    storage_options: dict | None
        Passed to ``DataFrame.to_parquet``; falls back to the
        ``storage_options`` attribute of ``self.fs`` when it has one

    The buffer ``self._items[(field, record)]`` is emptied and removed once
    the file has been written.
    """
    # extra requirements if writing
    import kerchunk.df
    import numpy as np
    import pandas as pd

    # TODO: if the dict is incomplete, also load records and merge in
    partition = self._items[(field, record)]
    fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"

    ####
    # one row per slot of the record; slots without data keep NaN in the
    # object columns and 0 in the integer columns
    paths = np.full(self.record_size, np.nan, dtype="O")
    offsets = np.zeros(self.record_size, dtype="int64")
    sizes = np.zeros(self.record_size, dtype="int64")
    raws = np.full(self.record_size, np.nan, dtype="O")
    # NOTE(review): nraw/npath are counted but not read again in this method
    nraw = 0
    npath = 0
    for j, data in partition.items():
        if isinstance(data, list):
            # [url] or [url, offset, size]
            npath += 1
            paths[j] = data[0]
            if len(data) > 1:
                offsets[j] = data[1]
                sizes[j] = data[2]
        else:
            # anything that is not a list is handed to kerchunk as raw data
            nraw += 1
            raws[j] = kerchunk.df._proc_raw(data)
    # TODO: only save needed columns
    df = pd.DataFrame(
        {
            "path": paths,
            "offset": offsets,
            "size": sizes,
            "raw": raws,
        },
        copy=False,
    )
    # store paths as categorical when each distinct path is repeated, on
    # average, more than ``self.cat_thresh`` times
    if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh:
        df["path"] = df["path"].astype("category")
    object_encoding = {"raw": "bytes", "path": "utf8"}
    has_nulls = ["path", "raw"]

    self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
    df.to_parquet(
        fn,
        engine="fastparquet",
        storage_options=storage_options
        or getattr(self.fs, "storage_options", None),
        compression="zstd",
        index=False,
        stats=False,
        object_encoding=object_encoding,
        has_nulls=has_nulls,
        # **kwargs,
    )
    partition.clear()
    self._items.pop((field, record))
442
+
443
def flush(self, base_url=None, storage_options=None):
    """Output any modified or deleted keys

    Parameters
    ----------
    base_url: str
        Location of the output
    storage_options: dict | None
        Forwarded to ``write`` for every pending record
    """
    # write what we have so far and clear sub chunks
    pending_records = [name for name in list(self._items) if isinstance(name, tuple)]
    for field, record in pending_records:
        self.write(
            field,
            record,
            base_url=base_url,
            storage_options=storage_options,
        )

    # gather .zmetadata from self._items and write that too
    meta_names = [
        name for name in list(self._items) if name != ".zmetadata" and ".z" in name
    ]
    for name in meta_names:
        self.zmetadata[name] = json.loads(self._items.pop(name))
    consolidated = json.dumps(
        {"metadata": self.zmetadata, "record_size": self.record_size}
    ).encode()
    self._items[".zmetadata"] = consolidated
    target = "/".join([base_url or self.out_root, ".zmetadata"])
    self.fs.pipe(target, self._items[".zmetadata"])

    # TODO: only clear those that we wrote to?
    self.open_refs.cache_clear()
475
+
476
+ def __len__(self):
477
+ # Caveat: This counts expected references, not actual
478
+ count = 0
479
+ for field in self.listdir():
480
+ if field.startswith("."):
481
+ count += 1
482
+ else:
483
+ chunk_sizes = self._get_chunk_sizes(field)
484
+ nchunks = self.np.product(chunk_sizes)
485
+ count += nchunks
486
+ count += len(self.zmetadata) # all metadata keys
487
+ count += len(self._items) # the metadata file itself
488
+ return count
489
+
490
+ def __iter__(self):
491
+ # Caveat: Note that this generates all expected keys, but does not
492
+ # account for reference keys that are missing.
493
+ metas = set(self.zmetadata)
494
+ metas.update(self._items)
495
+ for bit in metas:
496
+ if isinstance(bit, str):
497
+ yield bit
498
+ for field in self.listdir():
499
+ yield from self._keys_in_field(field)
500
+
501
+ def __contains__(self, item):
502
+ try:
503
+ self._load_one_key(item)
504
+ return True
505
+ except KeyError:
506
+ return False
507
+
508
+ def _keys_in_field(self, field):
509
+ """List key names in given field
510
+
511
+ Produces strings like "field/x.y" appropriate from the chunking of the array
512
+ """
513
+ chunk_sizes = self._get_chunk_sizes(field)
514
+ if len(chunk_sizes) == 0:
515
+ yield field + "/0"
516
+ return
517
+ inds = itertools.product(*(range(i) for i in chunk_sizes))
518
+ for ind in inds:
519
+ yield field + "/" + ".".join([str(c) for c in ind])
520
+
521
+
522
+ class ReferenceFileSystem(AsyncFileSystem):
523
+ """View byte ranges of some other file as a file system
524
+ Initial version: single file system target, which must support
525
+ async, and must allow start and end args in _cat_file. Later versions
526
+ may allow multiple arbitrary URLs for the targets.
527
+ This FileSystem is read-only. It is designed to be used with async
528
+ targets (for now). This FileSystem only allows whole-file access, no
529
+ ``open``. We do not get original file details from the target FS.
530
+ Configuration is by passing a dict of references at init, or a URL to
531
+ a JSON file containing the same; this dict
532
+ can also contain concrete data for some set of paths.
533
+ Reference dict format:
534
+ {path0: bytes_data, path1: (target_url, offset, size)}
535
+ https://github.com/fsspec/kerchunk/blob/main/README.md
536
+ """
537
+
538
+ protocol = "reference"
539
+
540
def __init__(
    self,
    fo,
    target=None,
    ref_storage_args=None,
    target_protocol=None,
    target_options=None,
    remote_protocol=None,
    remote_options=None,
    fs=None,
    template_overrides=None,
    simple_templates=True,
    max_gap=64_000,
    max_block=256_000_000,
    cache_size=128,
    **kwargs,
):
    """
    Parameters
    ----------
    fo : dict or str
        The set of references to use for this instance, with a structure as above.
        If str referencing a JSON file, will use fsspec.open, in conjunction
        with target_options and target_protocol to open and parse JSON at this
        location. If a directory, then assume references are a set of parquet
        files to be loaded lazily.
    target : str
        For any references having target_url as None, this is the default file
        target to use
    ref_storage_args : dict
        If references is a str, use these kwargs for loading the JSON file.
        Deprecated: use target_options instead.
    target_protocol : str
        Used for loading the reference file, if it is a path. If None, protocol
        will be derived from the given path
    target_options : dict
        Extra FS options for loading the reference file ``fo``, if given as a path
    remote_protocol : str
        The protocol of the filesystem on which the references will be evaluated
        (unless fs is provided). If not given, will be derived from the first
        URL that has a protocol in the templates or in the references, in that
        order.
    remote_options : dict
        kwargs to go with remote_protocol
    fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict))
        Directly provide a file system(s):
            - a single filesystem instance
            - a dict of protocol:filesystem, where each value is either a filesystem
              instance, or a dict of kwargs that can be used to create an
              instance for the given protocol

        If this is given, remote_options and remote_protocol are ignored.
    template_overrides : dict
        Swap out any templates in the references file with these - useful for
        testing.
    simple_templates: bool
        Whether templates can be processed with simple replace (True) or if
        jinja is needed (False, much slower). All reference sets produced by
        ``kerchunk`` are simple in this sense, but the spec allows for complex.
    max_gap, max_block: int
        For merging multiple concurrent requests to the same remote file.
        Neighboring byte ranges will only be merged when their
        inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0
        to only merge when it requires no extra bytes. Pass a negative
        number to disable merging, appropriate for local target files.
        Neighboring byte ranges will only be merged when the size of
        the aggregated range is <= ``max_block``. Default is 256MB.
    cache_size : int
        Maximum size of LRU cache, where cache_size*record_size denotes
        the total number of references that can be loaded in memory at once.
        Only used for lazily loaded references.
    kwargs : passed to parent class
    """
    super().__init__(**kwargs)
    self.target = target
    self.template_overrides = template_overrides
    self.simple_templates = simple_templates
    self.templates = {}
    self.fss = {}  # protocol -> filesystem used to fetch referenced data
    # NOTE(review): the other methods visible in this class read and write
    # ``self.dircache``; ``_dircache`` is not used by them - confirm it is
    # still needed
    self._dircache = {}
    self.max_gap = max_gap
    self.max_block = max_block
    if isinstance(fo, str):
        # ref_storage_args (deprecated) takes precedence over target_options
        dic = dict(
            **(ref_storage_args or target_options or {}), protocol=target_protocol
        )
        ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
        if ref_fs.isfile(fo2):
            # text JSON
            with fsspec.open(fo, "rb", **dic) as f:
                logger.info("Read reference from URL %s", fo)
                text = json.load(f)
            self._process_references(text, template_overrides)
        else:
            # Lazy parquet refs
            logger.info("Open lazy reference dict from URL %s", fo)
            self.references = LazyReferenceMapper(
                fo2,
                fs=ref_fs,
                cache_size=cache_size,
            )
    else:
        # dictionaries
        self._process_references(fo, template_overrides)
    if isinstance(fs, dict):
        # explicit protocol -> filesystem mapping; dict values are treated
        # as kwargs for creating the filesystem of that protocol
        self.fss = {
            k: (
                fsspec.filesystem(k.split(":", 1)[0], **opts)
                if isinstance(opts, dict)
                else opts
            )
            for k, opts in fs.items()
        }
        if None not in self.fss:
            self.fss[None] = filesystem("file")
        # nothing is inferred from templates/references in this case
        return
    if fs is not None:
        # single remote FS
        remote_protocol = (
            fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
        )
        self.fss[remote_protocol] = fs

    if remote_protocol is None:
        # get single protocol from any templates
        for ref in self.templates.values():
            if callable(ref):
                ref = ref()
            protocol, _ = fsspec.core.split_protocol(ref)
            if protocol and protocol not in self.fss:
                fs = filesystem(protocol, **(remote_options or {}))
                self.fss[protocol] = fs
    if remote_protocol is None:
        # get single protocol from references
        # TODO: warning here, since this can be very expensive?
        for ref in self.references.values():
            if callable(ref):
                ref = ref()
            if isinstance(ref, list) and ref[0]:
                protocol, _ = fsspec.core.split_protocol(ref[0])
                if protocol not in self.fss:
                    fs = filesystem(protocol, **(remote_options or {}))
                    self.fss[protocol] = fs
                    # only use first remote URL
                    break

    if remote_protocol and remote_protocol not in self.fss:
        fs = filesystem(remote_protocol, **(remote_options or {}))
        self.fss[remote_protocol] = fs

    # ``fs`` is whichever filesystem was given or created last above
    self.fss[None] = fs or filesystem("file")  # default one
691
+
692
def _cat_common(self, path, start=None, end=None):
    """Resolve ``path`` to inline data or to a remote byte range.

    Returns
    -------
    tuple
        ``(data, None, None)`` when the reference holds data directly
        (str/bytes, with a "base64:" prefix decoded), otherwise
        ``(url, start, end)``. For a whole-file reference the given
        ``start``/``end`` are passed through; for a ranged reference they
        are interpreted relative to that range (negative values count from
        its end). A missing url is replaced by ``self.target``.

    Raises
    ------
    FileNotFoundError
        If ``path`` is not a known reference.
    """
    path = self._strip_protocol(path)
    logger.debug(f"cat: {path}")
    try:
        part = self.references[path]
    except KeyError:
        raise FileNotFoundError(path)
    if isinstance(part, str):
        part = part.encode()
    if isinstance(part, bytes):
        logger.debug(f"Reference: {path}, type bytes")
        if part.startswith(b"base64:"):
            part = base64.b64decode(part[7:])
        return part, None, None

    if len(part) == 1:
        logger.debug(f"Reference: {path}, whole file => {part}")
        url = part[0]
        start1, end1 = start, end
    else:
        url, start0, size = part
        logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}")
        end0 = start0 + size

        def _absolute(relative, fallback):
            # translate a position relative to the referenced range into a
            # position within the target file
            if relative is None:
                return fallback
            anchor = start0 if relative >= 0 else end0
            return anchor + relative

        start1 = _absolute(start, start0)
        end1 = _absolute(end, end0)
    if url is None:
        url = self.target
    return url, start1, end1
733
+
734
+ async def _cat_file(self, path, start=None, end=None, **kwargs):
735
+ part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
736
+ if isinstance(part_or_url, bytes):
737
+ return part_or_url[start:end]
738
+ protocol, _ = split_protocol(part_or_url)
739
+ try:
740
+ await self.fss[protocol]._cat_file(part_or_url, start=start, end=end)
741
+ except Exception as e:
742
+ raise ReferenceNotReachable(path, part_or_url) from e
743
+
744
def cat_file(self, path, start=None, end=None, **kwargs):
    """Fetch the bytes of one reference.

    Inline data is sliced with ``start``/``end`` directly; otherwise the
    resolved range is read from the filesystem registered for the url's
    protocol. Any failure of that read is re-raised as
    ``ReferenceNotReachable``.
    """
    resolved, lo, hi = self._cat_common(path, start=start, end=end)
    if isinstance(resolved, bytes):
        return resolved[start:end]
    protocol, _ = split_protocol(resolved)
    try:
        return self.fss[protocol].cat_file(resolved, start=lo, end=hi)
    except Exception as e:
        raise ReferenceNotReachable(path, resolved) from e
753
+
754
def pipe_file(self, path, value, **_):
    """Temporarily add binary data or reference as a file

    The directory cache is cleared, as in ``_pipe_file``, so that a
    following ``ls``/``isdir`` call rebuilds it and sees the new entry.
    """
    self.references[path] = value
    self.dircache.clear()  # this is a bit heavy handed
757
+
758
+ async def _get_file(self, rpath, lpath, **kwargs):
759
+ if self.isdir(rpath):
760
+ return os.makedirs(lpath, exist_ok=True)
761
+ data = await self._cat_file(rpath)
762
+ with open(lpath, "wb") as f:
763
+ f.write(data)
764
+
765
def get_file(self, rpath, lpath, callback=_DEFAULT_CALLBACK, **kwargs):
    """Copy one reference to a local path or file-like object.

    A directory ``rpath`` only creates ``lpath`` as a directory. The
    callback is told the total size before, and the written size after,
    the data is stored.
    """
    if self.isdir(rpath):
        return os.makedirs(lpath, exist_ok=True)
    payload = self.cat_file(rpath, **kwargs)
    callback.set_size(len(payload))
    if not isfilelike(lpath):
        with open(lpath, "wb") as sink:
            sink.write(payload)
    else:
        lpath.write(payload)
    callback.absolute_update(len(payload))
776
+
777
def get(self, rpath, lpath, recursive=False, **kwargs):
    """Copy one or more references to local files.

    All data is fetched with a single ``cat`` call and then written through
    a local filesystem created with ``auto_mkdir=True``; paths missing from
    the ``cat`` result are skipped.
    """
    if recursive:
        # trigger directory build
        self.ls("")
    rpath = self.expand_path(rpath, recursive=recursive)
    local_fs = fsspec.filesystem("file", auto_mkdir=True)
    targets = other_paths(rpath, lpath)
    if recursive:
        wanted = [r for r in rpath if not self.isdir(r)]
    else:
        wanted = rpath
    data = self.cat(wanted)
    for remote, local in zip(rpath, targets):
        if remote in data:
            local_fs.pipe_file(local, data[remote])
791
+
792
def cat(self, path, recursive=False, on_error="raise", **kwargs):
    """Fetch the contents of one or more references.

    References are grouped by the protocol of their target, neighbouring
    byte ranges of the same target are merged (subject to ``self.max_gap``
    and ``self.max_block``), fetched with ``cat_ranges`` and cut back into
    the individually requested pieces.

    Parameters
    ----------
    path: str or list of str
        Reference key(s)
    recursive: bool
        Not supported; raises NotImplementedError when True
    on_error: str
        "raise" re-raises the first problem; "omit" leaves failed keys out
        of the result; any other value stores the exception as the value

    Returns
    -------
    The single value when a single non-glob string path produced exactly
    one result, otherwise a dict of path -> bytes (or exception).
    """
    if isinstance(path, str) and recursive:
        raise NotImplementedError
    if isinstance(path, list) and (recursive or any("*" in p for p in path)):
        raise NotImplementedError
    # TODO: if references is lazy, pre-fetch all paths in batch before access
    proto_dict = _protocol_groups(path, self.references)
    out = {}
    for proto, paths in proto_dict.items():
        fs = self.fss[proto]
        urls, starts, ends, valid_paths = [], [], [], []
        for p in paths:
            # find references or label not-found. Early exit if any not
            # found and on_error is "raise"
            try:
                u, s, e = self._cat_common(p)
            except FileNotFoundError as err:
                if on_error == "raise":
                    raise
                if on_error != "omit":
                    out[p] = err
            else:
                urls.append(u)
                starts.append(s)
                ends.append(e)
                valid_paths.append(p)

        # process references into form for merging
        urls2 = []
        starts2 = []
        ends2 = []
        paths2 = []
        whole_files = set()
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            if isinstance(u, bytes):
                # data
                out[p] = u
            elif s is None:
                # whole file - limits are None, None, but no further
                # entries take for this file
                whole_files.add(u)
                urls2.append(u)
                starts2.append(s)
                ends2.append(e)
                paths2.append(p)
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            # second run to account for files that are to be loaded whole
            if s is not None and u not in whole_files:
                urls2.append(u)
                starts2.append(s)
                ends2.append(e)
                paths2.append(p)

        # merge and fetch consolidated ranges
        new_paths, new_starts, new_ends = merge_offset_ranges(
            list(urls2),
            list(starts2),
            list(ends2),
            sort=True,
            max_gap=self.max_gap,
            max_block=self.max_block,
        )
        bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)

        # unbundle from merged bytes - simple approach
        for u, s, e, p in zip(urls, starts, ends, valid_paths):
            if p in out:
                continue  # was bytes, already handled
            # NOTE: ``np`` below is a local loop variable (the merged path)
            for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
                if np == u and (ns is None or ne is None):
                    # the target was fetched whole: slice with the
                    # requested limits directly
                    if isinstance(b, Exception):
                        out[p] = b
                    else:
                        out[p] = b[s:e]
                elif np == u and s >= ns and e <= ne:
                    # the requested range lies inside this merged block
                    if isinstance(b, Exception):
                        out[p] = b
                    else:
                        # (e - ne) is <= 0; ``or None`` slices to the end
                        # of the block when it is exactly 0
                        out[p] = b[s - ns : (e - ne) or None]

    for k, v in out.copy().items():
        # these were valid references, but fetch failed, so transform exc
        if isinstance(v, Exception) and k in self.references:
            ex = out[k]
            new_ex = ReferenceNotReachable(k, self.references[k])
            new_ex.__cause__ = ex
            if on_error == "raise":
                raise new_ex
            elif on_error != "omit":
                out[k] = new_ex

    if len(out) == 1 and isinstance(path, str) and "*" not in path:
        return _first(out)
    return out
886
+
887
+ def _process_references(self, references, template_overrides=None):
888
+ vers = references.get("version", None)
889
+ if vers is None:
890
+ self._process_references0(references)
891
+ elif vers == 1:
892
+ self._process_references1(references, template_overrides=template_overrides)
893
+ else:
894
+ raise ValueError(f"Unknown reference spec version: {vers}")
895
+ # TODO: we make dircache by iterating over all entries, but for Spec >= 1,
896
+ # can replace with programmatic. Is it even needed for mapper interface?
897
+
898
+ def _process_references0(self, references):
899
+ """Make reference dict for Spec Version 0"""
900
+ self.references = references
901
+
902
+ def _process_references1(self, references, template_overrides=None):
903
+ if not self.simple_templates or self.templates:
904
+ import jinja2
905
+ self.references = {}
906
+ self._process_templates(references.get("templates", {}))
907
+
908
+ @lru_cache(1000)
909
+ def _render_jinja(u):
910
+ return jinja2.Template(u).render(**self.templates)
911
+
912
+ for k, v in references.get("refs", {}).items():
913
+ if isinstance(v, str):
914
+ if v.startswith("base64:"):
915
+ self.references[k] = base64.b64decode(v[7:])
916
+ self.references[k] = v
917
+ elif self.templates:
918
+ u = v[0]
919
+ if "{{" in u:
920
+ if self.simple_templates:
921
+ u = (
922
+ u.replace("{{", "{")
923
+ .replace("}}", "}")
924
+ .format(**self.templates)
925
+ )
926
+ else:
927
+ u = _render_jinja(u)
928
+ self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
929
+ else:
930
+ self.references[k] = v
931
+ self.references.update(self._process_gen(references.get("gen", [])))
932
+
933
+ def _process_templates(self, tmp):
934
+ self.templates = {}
935
+ if self.template_overrides is not None:
936
+ tmp.update(self.template_overrides)
937
+ for k, v in tmp.items():
938
+ if "{{" in v:
939
+ import jinja2
940
+
941
+ self.templates[k] = lambda temp=v, **kwargs: jinja2.Template(
942
+ temp
943
+ ).render(**kwargs)
944
+ else:
945
+ self.templates[k] = v
946
+
947
+ def _process_gen(self, gens):
948
+ out = {}
949
+ for gen in gens:
950
+ dimension = {
951
+ k: v
952
+ if isinstance(v, list)
953
+ else range(v.get("start", 0), v["stop"], v.get("step", 1))
954
+ for k, v in gen["dimensions"].items()
955
+ }
956
+ products = (
957
+ dict(zip(dimension.keys(), values))
958
+ for values in itertools.product(*dimension.values())
959
+ )
960
+ for pr in products:
961
+ import jinja2
962
+
963
+ key = jinja2.Template(gen["key"]).render(**pr, **self.templates)
964
+ url = jinja2.Template(gen["url"]).render(**pr, **self.templates)
965
+ if ("offset" in gen) and ("length" in gen):
966
+ offset = int(
967
+ jinja2.Template(gen["offset"]).render(**pr, **self.templates)
968
+ )
969
+ length = int(
970
+ jinja2.Template(gen["length"]).render(**pr, **self.templates)
971
+ )
972
+ out[key] = [url, offset, length]
973
+ elif ("offset" in gen) ^ ("length" in gen):
974
+ raise ValueError(
975
+ "Both 'offset' and 'length' are required for a "
976
+ "reference generator entry if either is provided."
977
+ )
978
+ else:
979
+ out[key] = [url]
980
+ return out
981
+
982
def _dircache_from_items(self):
    """Build ``self.dircache`` (directory -> list of entries) from all references.

    Every reference becomes a "file" entry in the listing of its parent
    directory; parent directories not seen before are registered as
    "directory" entries, from the outermost one inwards.
    """
    self.dircache = {"": []}
    it = self.references.items()
    for path, part in it:
        if isinstance(part, (bytes, str)):
            # inline data: size is the length of the stored value
            size = len(part)
        elif len(part) == 1:
            # whole-file reference: size is not known here
            size = None
        else:
            _, _, size = part
        par = path.rsplit("/", 1)[0] if "/" in path else ""
        par0 = par
        subdirs = [par0]
        while par0 and par0 not in self.dircache:
            # collect parent directories
            par0 = self._parent(par0)
            subdirs.append(par0)

        # innermost-first -> outermost-first; the first element is an
        # already-registered directory (or the root "")
        subdirs = subdirs[::-1]
        for parent, child in zip(subdirs, subdirs[1:]):
            # register newly discovered directories
            assert child not in self.dircache
            assert parent in self.dircache
            self.dircache[parent].append(
                {"name": child, "type": "directory", "size": 0}
            )
            self.dircache[child] = []

        self.dircache[par].append({"name": path, "type": "file", "size": size})
1011
+
1012
+ def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
1013
+ data = self.cat_file(path) # load whole chunk into memory
1014
+ return io.BytesIO(data)
1015
+
1016
def ls(self, path, detail=True, **kwargs):
    """List the entries below ``path``.

    Lazy reference sets delegate to the mapper's own ``ls``; otherwise the
    directory cache is built on first use and consulted.

    Raises
    ------
    FileNotFoundError
        If ``path`` is not known.
    """
    path = self._strip_protocol(path)
    refs = self.references
    if isinstance(refs, LazyReferenceMapper):
        try:
            return refs.ls(path, detail)
        except KeyError:
            pass
        raise FileNotFoundError(f"'{path}' is not a known key")
    if not self.dircache:
        self._dircache_from_items()
    listing = self._ls_from_cache(path)
    if listing is None:
        raise FileNotFoundError(path)
    if not detail:
        return [entry["name"] for entry in listing]
    return listing
1032
+
1033
def exists(self, path, **kwargs):  # overwrite auto-sync version
    """Return whether ``path`` is a known directory or file."""
    if self.isdir(path):
        return True
    return self.isfile(path)
1035
+
1036
def isdir(self, path):  # overwrite auto-sync version
    """Return whether ``path`` is a directory of the reference set.

    Uses the directory cache when it is populated, the top-level listing of
    a lazy mapper when references are lazy, and otherwise scans the
    reference keys for one starting with ``path + "/"``.
    """
    if self.dircache:
        return path in self.dircache
    if isinstance(self.references, LazyReferenceMapper):
        return path in self.references.listdir("")
    # this may be faster than building dircache for single calls, but
    # by looping will be slow for many calls; could cache it?
    for key in self.references:
        if key.startswith(f"{path}/"):
            return True
    return False
1045
+
1046
def isfile(self, path):  # overwrite auto-sync version
    """Return whether ``path`` is a key of the reference set."""
    known = path in self.references
    return known
1048
+
1049
+ async def _ls(self, path, detail=True, **kwargs): # calls fast sync code
1050
+ return self.ls(path, detail, **kwargs)
1051
+
1052
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
    """Return all reference keys starting with ``path``, sorted.

    With ``withdirs`` the generic implementation of the parent class is
    used. With ``detail`` a dict of key -> info entry is returned, taken
    from the directory cache (built on demand).
    """
    if withdirs:
        return super().find(
            path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
        )
    if not path:
        matches = sorted(self.references)
    else:
        path = self._strip_protocol(path)
        matches = sorted(key for key in self.references if key.startswith(path))
    if not detail:
        return matches
    if not self.dircache:
        self._dircache_from_items()
    return {key: self._ls_from_cache(key)[0] for key in matches}
1068
+
1069
def info(self, path, **kwargs):
    """Return the info dict ("name", "type", "size") of ``path``.

    For inline data the size is the length of the stored value; for ranged
    references it is the stored size. A path that is not a reference is
    looked up through ``ls`` and reported as a directory when the listing
    has no entry of that exact name. Whole-file references get their size
    from the filesystem of the target.
    """
    ref = self.references.get(path)
    if ref is None:
        matches = [entry for entry in self.ls(path, True) if entry["name"] == path]
        if not matches:
            return {"name": path, "type": "directory", "size": 0}
        found = matches[0]
    elif isinstance(ref, (str, bytes)):
        # decode base64 here
        return {"name": path, "type": "file", "size": len(ref)}
    elif len(ref) > 1:
        return {"name": path, "type": "file", "size": ref[2]}
    else:
        found = {"name": path, "type": "file", "size": None}
    if found["size"] is None:
        # if this is a whole remote file, update size using remote FS
        prot, _ = split_protocol(self.references[path][0])
        found["size"] = self.fss[prot].size(self.references[path][0])
    return found
1089
+
1090
+ async def _info(self, path, **kwargs): # calls fast sync code
1091
+ return self.info(path)
1092
+
1093
+ async def _rm_file(self, path, **kwargs):
1094
+ self.references.pop(
1095
+ path, None
1096
+ ) # ignores FileNotFound, just as well for directories
1097
+ self.dircache.clear() # this is a bit heavy handed
1098
+
1099
+ async def _pipe_file(self, path, data):
1100
+ # can be str or bytes
1101
+ self.references[path] = data
1102
+ self.dircache.clear() # this is a bit heavy handed
1103
+
1104
+ async def _put_file(self, lpath, rpath):
1105
+ # puts binary
1106
+ with open(lpath, "rb") as f:
1107
+ self.references[rpath] = f.read()
1108
+ self.dircache.clear() # this is a bit heavy handed
1109
+
1110
def save_json(self, url, **storage_options):
    """Write modified references into new location

    Bytes values are stored as ASCII text when possible and as
    "base64:"-prefixed text otherwise; the output is a version 1 reference
    document with only a "refs" section.
    """

    def _as_json_value(value):
        if not isinstance(value, bytes):
            return value
        try:
            return value.decode("ascii")
        except UnicodeDecodeError:
            return (b"base64:" + base64.b64encode(value)).decode()

    out = {key: _as_json_value(value) for key, value in self.references.items()}
    document = json.dumps({"version": 1, "refs": out}).encode()
    with fsspec.open(url, "wb", **storage_options) as f:
        f.write(document)
lib/python3.11/site-packages/fsspec/implementations/sftp.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ import os
4
+ import types
5
+ import uuid
6
+ from stat import S_ISDIR, S_ISLNK
7
+
8
+ import paramiko
9
+
10
+ from .. import AbstractFileSystem
11
+ from ..utils import infer_storage_options
12
+
13
+ logger = logging.getLogger("fsspec.sftp")
14
+
15
+
16
class SFTPFileSystem(AbstractFileSystem):
    """Files over SFTP/SSH

    Peer-to-peer filesystem over SSH using paramiko.

    Note: if using this with the ``open`` or ``open_files``, with full URLs,
    there is no way to tell if a path is relative, so all paths are assumed
    to be absolute.
    """

    protocol = "sftp", "ssh"

    def __init__(self, host, **ssh_kwargs):
        """

        Parameters
        ----------
        host: str
            Hostname or IP as a string
        temppath: str
            Location on the server to put files, when within a transaction
        ssh_kwargs: dict
            Parameters passed on to connection. See details in
            https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
            May include port, username, password...
        """
        if self._cached:
            return
        super().__init__(**ssh_kwargs)
        # ``temppath`` is ours, not paramiko's: remove it before connecting
        self.temppath = ssh_kwargs.pop("temppath", "/tmp")  # remote temp directory
        self.host = host
        self.ssh_kwargs = ssh_kwargs
        self._connect()

    def _connect(self):
        """Open the SSH connection and an SFTP session on top of it."""
        logger.debug("Connecting to SFTP server %s", self.host)
        self.client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts unknown host keys automatically
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(self.host, **self.ssh_kwargs)
        self.ftp = self.client.open_sftp()

    @classmethod
    def _strip_protocol(cls, path):
        """Return only the path component of a URL-like ``path``."""
        return infer_storage_options(path)["path"]

    @staticmethod
    def _get_kwargs_from_urls(urlpath):
        """Extract constructor keyword arguments from a URL.

        Everything ``infer_storage_options`` finds is returned, except the
        ``path`` and ``protocol`` entries.
        """
        out = infer_storage_options(urlpath)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def mkdir(self, path, create_parents=False, mode=511):
        """Create directory ``path`` with permission bits ``mode``.

        Raises
        ------
        FileExistsError
            If ``path`` already exists.
        """
        logger.debug("Creating folder %s", path)
        if self.exists(path):
            raise FileExistsError(f"File exists: {path}")

        if create_parents:
            # forward ``mode`` so that parents honour the requested permissions
            self.makedirs(path, mode=mode)
        else:
            self.ftp.mkdir(path, mode)

    def makedirs(self, path, exist_ok=False, mode=511):
        """Create ``path`` and any missing parent directories.

        Raises
        ------
        FileExistsError
            If ``path`` already exists and ``exist_ok`` is false.
        """
        if self.exists(path) and not exist_ok:
            raise FileExistsError(f"File exists: {path}")

        new_path = "/" if path[:1] == "/" else ""

        for part in path.split("/"):
            if not part:
                continue
            if new_path:
                # strip the root's own slash so absolute paths do not become
                # "//part"
                new_path = f"{new_path.rstrip('/')}/{part}"
            else:
                new_path = part
            if not self.exists(new_path):
                self.ftp.mkdir(new_path, mode)

    def rmdir(self, path):
        """Remove the directory ``path`` via the SFTP session."""
        logger.debug("Removing folder %s", path)
        self.ftp.rmdir(path)

    def info(self, path):
        """Return the decoded stat record for ``path``, with ``name`` set to it."""
        stat = self._decode_stat(self.ftp.stat(path))
        stat["name"] = path
        return stat

    @staticmethod
    def _decode_stat(stat, parent_path=None):
        """Convert a stat-like object into an info dictionary.

        ``name`` is left empty unless ``parent_path`` is given, in which case
        it is ``parent_path`` joined with ``stat.filename``.
        """
        if S_ISDIR(stat.st_mode):
            t = "directory"
        elif S_ISLNK(stat.st_mode):
            t = "link"
        else:
            t = "file"
        out = {
            "name": "",
            "size": stat.st_size,
            "type": t,
            "uid": stat.st_uid,
            "gid": stat.st_gid,
            "time": datetime.datetime.fromtimestamp(
                stat.st_atime, tz=datetime.timezone.utc
            ),
            "mtime": datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ),
        }
        if parent_path:
            out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
        return out

    def ls(self, path, detail=False):
        """List ``path``: info dicts if ``detail``, else sorted names."""
        logger.debug("Listing folder %s", path)
        stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
        if detail:
            return stats
        else:
            paths = [stat["name"] for stat in stats]
            return sorted(paths)

    def put(self, lpath, rpath, callback=None, **kwargs):
        """Upload local ``lpath`` to remote ``rpath``.

        ``callback`` and ``kwargs`` are accepted but not used.
        """
        logger.debug("Put file %s into %s", lpath, rpath)
        self.ftp.put(lpath, rpath)

    def get_file(self, rpath, lpath, **kwargs):
        """Download ``rpath`` to ``lpath``.

        If ``rpath`` is a directory, only the local directory is created.
        """
        if self.isdir(rpath):
            os.makedirs(lpath, exist_ok=True)
        else:
            self.ftp.get(self._strip_protocol(rpath), lpath)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        block_size: int or None
            If 0, no buffering, if 1, line buffering, if >1, buffer that many
            bytes, if None use default from paramiko.
        """
        logger.debug("Opening file %s", path)
        if kwargs.get("autocommit", True) is False:
            # writes to temporary file, move on commit
            path2 = "/".join([self.temppath, str(uuid.uuid4())])
            f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
            f.temppath = path2
            f.targetpath = path
            f.fs = self
            f.commit = types.MethodType(commit_a_file, f)
            f.discard = types.MethodType(discard_a_file, f)
        else:
            f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
        return f

    def _rm(self, path):
        """Remove ``path``, using ``rmdir`` for directories and ``remove`` otherwise."""
        if self.isdir(path):
            self.ftp.rmdir(path)
        else:
            self.ftp.remove(path)

    def mv(self, old, new):
        """Rename ``old`` to ``new`` with the SFTP ``posix_rename`` call."""
        logger.debug("Renaming %s into %s", old, new)
        self.ftp.posix_rename(old, new)
173
+
174
+
175
def commit_a_file(self):
    """Move the temporary file onto its target path.

    Bound onto file objects opened with ``autocommit=False``; reads the
    ``fs``, ``temppath`` and ``targetpath`` attributes set at open time.
    """
    filesystem = self.fs
    filesystem.mv(self.temppath, self.targetpath)
177
+
178
+
179
def discard_a_file(self):
    """Delete the temporary file instead of committing it.

    Bound onto file objects opened with ``autocommit=False``; reads the
    ``fs`` and ``temppath`` attributes set at open time.
    """
    filesystem = self.fs
    filesystem._rm(self.temppath)
lib/python3.11/site-packages/fsspec/implementations/smb.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module contains SMBFileSystem class responsible for handling access to
3
+ Windows Samba network shares by using package smbprotocol
4
+ """
5
+
6
+ import datetime
7
+ import uuid
8
+ from stat import S_ISDIR, S_ISLNK
9
+
10
+ import smbclient
11
+
12
+ from .. import AbstractFileSystem
13
+ from ..utils import infer_storage_options
14
+
15
+ # ! pylint: disable=bad-continuation
16
+
17
+
18
class SMBFileSystem(AbstractFileSystem):
    """Allow reading and writing to Windows and Samba network shares.

    When using `fsspec.open()` for getting a file-like object the URI
    should be specified as this format:
    ``smb://workgroup;user:password@server:port/share/folder/file.csv``.

    Example::

        >>> import fsspec
        >>> with fsspec.open(
        ...     'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
        ... ) as smbfile:
        ...     df = pd.read_csv(smbfile, sep='|', header=None)

    Note that you need to pass in a valid hostname or IP address for the host
    component of the URL. Do not use the Windows/NetBIOS machine name for the
    host component.

    The first component of the path in the URL points to the name of the shared
    folder. Subsequent path components will point to the directory/folder/file.

    The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
    optional.

    .. note::

        For working this source require `smbprotocol`_ to be installed, e.g.::

            $ pip install smbprotocol
            # or
            # pip install smbprotocol[kerberos]

    .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements

    Note: if using this with the ``open`` or ``open_files``, with full URLs,
    there is no way to tell if a path is relative, so all paths are assumed
    to be absolute.
    """

    protocol = "smb"

    # pylint: disable=too-many-arguments
    def __init__(
        self,
        host,
        port=None,
        username=None,
        password=None,
        timeout=60,
        encrypt=None,
        share_access=None,
        **kwargs,
    ):
        """
        You can use _get_kwargs_from_urls to get some kwargs from
        a reasonable SMB url.

        Authentication will be anonymous or integrated if username/password are not
        given.

        Parameters
        ----------
        host: str
            The remote server name/ip to connect to
        port: int or None
            Port to connect with. Usually 445, sometimes 139.
        username: str or None
            Username to connect with. Required if Kerberos auth is not being used.
        password: str or None
            User's password on the server, if using username
        timeout: int
            Connection timeout in seconds
        encrypt: bool
            Whether to force encryption or not, once this has been set to True
            the session cannot be changed back to False.
        share_access: str or None
            Specifies the default access applied to file open operations
            performed with this file system object.
            This affects whether other processes can concurrently open a handle
            to the same file.

            - None (the default): exclusively locks the file until closed.
            - 'r': Allow other handles to be opened with read access.
            - 'w': Allow other handles to be opened with write access.
            - 'd': Allow other handles to be opened with delete access.
        """
        super().__init__(**kwargs)
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.timeout = timeout
        self.encrypt = encrypt
        # optional ``temppath`` keyword: inserted after the share name when
        # building temporary file locations for deferred writes
        self.temppath = kwargs.pop("temppath", "")
        self.share_access = share_access
        self._connect()

    @property
    def _port(self):
        """Port actually used for connections: 445 unless one was given."""
        return 445 if self.port is None else self.port

    def _connect(self):
        """Register a session for ``self.host`` with the stored credentials."""
        smbclient.register_session(
            self.host,
            username=self.username,
            password=self.password,
            port=self._port,
            encrypt=self.encrypt,
            connection_timeout=self.timeout,
        )

    @classmethod
    def _strip_protocol(cls, path):
        """Return only the path component of a URL-like ``path``."""
        return infer_storage_options(path)["path"]

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract constructor keyword arguments from an SMB URL.

        Everything ``infer_storage_options`` finds is returned, except the
        ``path`` and ``protocol`` entries.
        """
        # smb://workgroup;user:password@host:port/share/folder/file.csv
        out = infer_storage_options(path)
        out.pop("path", None)
        out.pop("protocol", None)
        return out

    def mkdir(self, path, create_parents=True, **kwargs):
        """Create directory ``path``; with ``create_parents`` also its parents."""
        wpath = _as_unc_path(self.host, path)
        if create_parents:
            smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
        else:
            smbclient.mkdir(wpath, port=self._port, **kwargs)

    def makedirs(self, path, exist_ok=False):
        """Create ``path`` recursively; a bare share path is ignored."""
        if _share_has_path(path):
            wpath = _as_unc_path(self.host, path)
            smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)

    def rmdir(self, path):
        """Remove directory ``path``; a bare share path is ignored."""
        if _share_has_path(path):
            wpath = _as_unc_path(self.host, path)
            smbclient.rmdir(wpath, port=self._port)

    def info(self, path, **kwargs):
        """Return an info dictionary for ``path`` built from ``smbclient.stat``.

        Directory names are returned with a trailing ``/``.
        """
        wpath = _as_unc_path(self.host, path)
        stats = smbclient.stat(wpath, port=self._port, **kwargs)
        if S_ISDIR(stats.st_mode):
            stype = "directory"
        elif S_ISLNK(stats.st_mode):
            stype = "link"
        else:
            stype = "file"
        res = {
            "name": path + "/" if stype == "directory" else path,
            "size": stats.st_size,
            "type": stype,
            "uid": stats.st_uid,
            "gid": stats.st_gid,
            "time": stats.st_atime,
            "mtime": stats.st_mtime,
        }
        return res

    def created(self, path):
        """Return the created timestamp of a file as a datetime.datetime"""
        wpath = _as_unc_path(self.host, path)
        stats = smbclient.stat(wpath, port=self._port)
        return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)

    def modified(self, path):
        """Return the modified timestamp of a file as a datetime.datetime"""
        wpath = _as_unc_path(self.host, path)
        stats = smbclient.stat(wpath, port=self._port)
        return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)

    def ls(self, path, detail=True, **kwargs):
        """List ``path``: info dicts if ``detail``, else joined path strings."""
        unc = _as_unc_path(self.host, path)
        listed = smbclient.listdir(unc, port=self._port, **kwargs)
        dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
        if detail:
            # one stat call per entry
            dirs = [self.info(d) for d in dirs]
        return dirs

    # pylint: disable=too-many-arguments
    def _open(
        self,
        path,
        mode="rb",
        block_size=-1,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """
        block_size: int or None
            If 0, no buffering, 1, line buffering, >1, buffer that many bytes

        Notes
        -----
        By specifying 'share_access' in 'kwargs' it is possible to override the
        default shared access setting applied in the constructor of this object.
        """
        bls = block_size if block_size is not None and block_size >= 0 else -1
        wpath = _as_unc_path(self.host, path)
        share_access = kwargs.pop("share_access", self.share_access)
        if "w" in mode and autocommit is False:
            temp = _as_temp_path(self.host, path, self.temppath)
            # pass share_access on so deferred writes honour it like direct opens
            return SMBFileOpener(
                wpath,
                temp,
                mode,
                port=self._port,
                block_size=bls,
                share_access=share_access,
                **kwargs,
            )
        return smbclient.open_file(
            wpath,
            mode,
            buffering=bls,
            share_access=share_access,
            port=self._port,
            **kwargs,
        )

    def copy(self, path1, path2, **kwargs):
        """Copy within two locations in the same filesystem"""
        wpath1 = _as_unc_path(self.host, path1)
        wpath2 = _as_unc_path(self.host, path2)
        smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)

    def _rm(self, path):
        """Remove a file or directory at ``path``; a bare share path is ignored."""
        if _share_has_path(path):
            wpath = _as_unc_path(self.host, path)
            stats = smbclient.stat(wpath, port=self._port)
            if S_ISDIR(stats.st_mode):
                smbclient.rmdir(wpath, port=self._port)
            else:
                smbclient.remove(wpath, port=self._port)

    def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
        """Rename ``path1`` to ``path2``; ``recursive``/``maxdepth`` are unused."""
        wpath1 = _as_unc_path(self.host, path1)
        wpath2 = _as_unc_path(self.host, path2)
        smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
254
+
255
+
256
+ def _as_unc_path(host, path):
257
+ rpath = path.replace("/", "\\")
258
+ unc = f"\\\\{host}{rpath}"
259
+ return unc
260
+
261
+
262
def _as_temp_path(host, path, temppath):
    """Return a UNC path for a new, uniquely named temporary file.

    The file lives on the same share as ``path`` (its first component after
    the leading ``/``), under ``temppath``, with a random UUID as its name.
    """
    share_name = path.split("/")[1]
    return _as_unc_path(host, f"/{share_name}{temppath}/{uuid.uuid4()}")
267
+
268
+
269
+ def _share_has_path(path):
270
+ parts = path.count("/")
271
+ if path.endswith("/"):
272
+ return parts > 2
273
+ return parts > 1
274
+
275
+
276
class SMBFileOpener:
    """writes to remote temporary file, move on commit"""

    def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
        """Remember the target/temporary locations and open the temporary file.

        Parameters
        ----------
        path: str
            Final destination, used by ``commit``
        temp: str
            Temporary location that is actually opened and written to
        mode: str
            Mode passed to ``smbclient.open_file``
        port: int
            Port passed to every ``smbclient`` call
        block_size: int
            Passed to ``smbclient.open_file`` as ``buffering``
        kwargs:
            Further keyword arguments for ``smbclient.open_file``
        """
        self.path = path
        self.temp = temp
        self.mode = mode
        self.block_size = block_size
        self.kwargs = kwargs
        self.smbfile = None
        self._incontext = False
        self.port = port
        self._open()

    def _open(self):
        """Open the temporary file unless an open handle already exists."""
        if self.smbfile is None or self.smbfile.closed:
            self.smbfile = smbclient.open_file(
                self.temp,
                self.mode,
                port=self.port,
                buffering=self.block_size,
                **self.kwargs,
            )

    def commit(self):
        """Move temp file to definitive on success."""
        # TODO: use transaction support in SMB protocol
        smbclient.replace(self.temp, self.path, port=self.port)

    def discard(self):
        """Remove the temp file on failure."""
        smbclient.remove(self.temp, port=self.port)

    def __fspath__(self):
        """Return the final destination path."""
        return self.path

    def __iter__(self):
        """Iterate over the wrapped file object."""
        return self.smbfile.__iter__()

    def __getattr__(self, item):
        """Delegate unknown attributes to the wrapped file object."""
        # __getattr__ only runs when normal lookup fails; if ``smbfile`` itself
        # is missing (instance built without __init__), looking it up here
        # would recurse forever, so fail cleanly instead
        if item == "smbfile":
            raise AttributeError(item)
        return getattr(self.smbfile, item)

    def __enter__(self):
        """Enter the wrapped file's context and return what it returns."""
        self._incontext = True
        return self.smbfile.__enter__()

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the wrapped file's context."""
        self._incontext = False
        self.smbfile.__exit__(exc_type, exc_value, traceback)
lib/python3.11/site-packages/fsspec/implementations/tar.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import tarfile
3
+
4
+ import fsspec
5
+ from fsspec.archive import AbstractArchiveFileSystem
6
+ from fsspec.compression import compr
7
+ from fsspec.utils import infer_compression
8
+
9
+ typemap = {b"0": "file", b"5": "directory"}
10
+
11
+ logger = logging.getLogger("tar")
12
+
13
+
14
class TarFileSystem(AbstractArchiveFileSystem):
    """Compressed Tar archives as a file-system (read-only)

    Supports the following formats:
    tar.gz, tar.bz2, tar.xz
    """

    root_marker = ""
    protocol = "tar"
    cachable = False

    def __init__(
        self,
        fo="",
        index_store=None,
        target_options=None,
        target_protocol=None,
        compression=None,
        **kwargs,
    ):
        """Open the archive and index its members.

        ``fo`` is either a URL string, opened via ``fsspec.open`` with
        ``target_protocol`` and ``target_options``, or an already-open file
        object. When ``compression`` is None it is inferred from the file
        name, if a name can be found on ``fo``.
        """
        super().__init__(**kwargs)
        if not target_options:
            target_options = {}

        if isinstance(fo, str):
            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
            fo = self.of.open()  # keep the reference

        # Try to infer compression.
        if compression is None:
            name = None

            # Try different ways to get hold of the filename. `fo` might either
            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
            # `fsspec.AbstractFileSystem` instance.
            try:
                # Checked in order:
                #   "original" - amended io.BufferedReader or similar; a
                #       "protocol extension" where original filenames are
                #       propagated to archive-like filesystems so they can
                #       infer the right compression
                #   "path" - fsspec.LocalFileOpener
                #   "name" - io.BufferedReader
                for attribute in ("original", "path", "name"):
                    if hasattr(fo, attribute):
                        name = getattr(fo, attribute)
                        break
                else:
                    # fsspec.AbstractFileSystem
                    if hasattr(fo, "info"):
                        name = fo.info()["name"]

            except Exception as ex:
                logger.warning(
                    f"Unable to determine file name, not inferring compression: {ex}"
                )

            if name is not None:
                compression = infer_compression(name)
                logger.info(f"Inferred compression {compression} from file name {name}")

        if compression is not None:
            # TODO: tarfile already implements compression with modes like "'r:gz'",
            #  but then would seek to offset in the file work?
            fo = compr[compression](fo)

        self._fo_ref = fo
        self.fo = fo  # the whole instance is a context
        self.tar = tarfile.TarFile(fileobj=self.fo)
        self.dir_cache = None

        self.index_store = index_store
        self.index = None
        self._index()

    def _index(self):
        """Build ``self.index``: member name -> (info dict, data offset)."""
        # TODO: load and set saved index, if exists
        members = {}
        for ti in self.tar:
            details = ti.get_info()
            details["type"] = typemap.get(details["type"], "file")
            key = details["name"].rstrip("/")
            members[key] = (details, ti.offset_data)

        self.index = members
        # TODO: save index to self.index_store here, if set

    def _get_dirs(self):
        """Populate ``self.dir_cache`` once with directories and members."""
        if self.dir_cache is not None:
            return

        # This enables ls to get directories as children as well as files
        cache = {}
        for dirname in self._all_dirnames(self.tar.getnames()):
            cache[dirname] = {"name": dirname, "size": 0, "type": "directory"}
        self.dir_cache = cache
        for member in self.tar.getmembers():
            details = member.get_info()
            details["name"] = details["name"].rstrip("/")
            details["type"] = typemap.get(details["type"], "file")
            self.dir_cache[details["name"]] = details

    def _open(self, path, mode="rb", **kwargs):
        """Return a readable file object for the regular file ``path``.

        Raises
        ------
        ValueError
            If ``mode`` is not ``"rb"`` or the member is not a regular file.
        """
        if mode != "rb":
            raise ValueError("Read-only filesystem implementation")
        details, _offset = self.index[path]
        if details["type"] != "file":
            raise ValueError("Can only handle regular files")
        return self.tar.extractfile(path)
lib/python3.11/site-packages/fsspec/implementations/webhdfs.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
2
+
3
+ import logging
4
+ import os
5
+ import secrets
6
+ import shutil
7
+ import tempfile
8
+ import uuid
9
+ from contextlib import suppress
10
+ from urllib.parse import quote
11
+
12
+ import requests
13
+
14
+ from ..spec import AbstractBufferedFile, AbstractFileSystem
15
+ from ..utils import infer_storage_options, tokenize
16
+
17
+ logger = logging.getLogger("webhdfs")
18
+
19
+
20
+ class WebHDFS(AbstractFileSystem):
21
+ """
22
+ Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
23
+
24
+ Four auth mechanisms are supported:
25
+
26
+ insecure: no auth is done, and the user is assumed to be whoever they
27
+ say they are (parameter ``user``), or a predefined value such as
28
+ "dr.who" if not given
29
+ spnego: when kerberos authentication is enabled, auth is negotiated by
30
+ requests_kerberos https://github.com/requests/requests-kerberos .
31
+ This establishes a session based on existing kinit login and/or
32
+ specified principal/password; parameters are passed with ``kerb_kwargs``
33
+ token: uses an existing Hadoop delegation token from another secured
34
+ service. Indeed, this client can also generate such tokens when
35
+ not insecure. Note that tokens expire, but can be renewed (by a
36
+ previously specified user) and may allow for proxying.
37
+ basic-auth: used when both parameter ``user`` and parameter ``password``
38
+ are provided.
39
+
40
+ """
41
+
42
+ tempdir = str(tempfile.gettempdir())
43
+ protocol = "webhdfs", "webHDFS"
44
+
45
def __init__(
    self,
    host,
    port=50070,
    kerberos=False,
    token=None,
    user=None,
    password=None,
    proxy_to=None,
    kerb_kwargs=None,
    data_proxy=None,
    use_https=False,
    **kwargs,
):
    """
    Parameters
    ----------
    host: str
        Name-node address
    port: int
        Port for webHDFS
    kerberos: bool
        Whether to authenticate with kerberos for this connection
    token: str or None
        If given, use this token on every call to authenticate. A user
        and user-proxy may be encoded in the token and should not be also
        given
    user: str or None
        If given, assert the user name to connect with
    password: str or None
        If given, assert the password to use for basic auth. If password
        is provided, user must be provided also
    proxy_to: str or None
        If given, the user has the authority to proxy, and this value is
        the user in whose name actions are taken
    kerb_kwargs: dict
        Any extra arguments for HTTPKerberosAuth, see
        `<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
    data_proxy: dict, callable or None
        If given, map data-node addresses. This can be necessary if the
        HDFS cluster is behind a proxy, running on Docker or otherwise has
        a mismatch between the host-names given by the name-node and the
        address by which to refer to them from the client. If a dict,
        maps host names ``host->data_proxy[host]``; if a callable, full
        URLs are passed, and function must conform to
        ``url->data_proxy(url)``.
    use_https: bool
        Whether to connect to the Name-node using HTTPS instead of HTTP
    kwargs

    Raises
    ------
    ValueError
        If ``token`` is combined with ``user`` or ``proxy_to``, if
        ``password`` is given without ``user``, or if ``kerberos`` is
        combined with ``user``.
    """
    if self._cached:
        return
    super().__init__(**kwargs)
    self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
    self.kerb = kerberos
    self.kerb_kwargs = kerb_kwargs or {}
    self.pars = {}  # query parameters added to every request
    self.proxy = data_proxy or {}
    if token is not None:
        if user is not None or proxy_to is not None:
            raise ValueError(
                "If passing a delegation token, must not set "
                "user or proxy_to, as these are encoded in the"
                " token"
            )
        self.pars["delegation"] = token
    self.user = user
    self.password = password

    if password is not None:
        if user is None:
            # trailing space added: the two literals used to join as "beset"
            raise ValueError(
                "If passing a password, the user must also be "
                "set in order to set up the basic-auth"
            )
    else:
        # without a password the user name goes in as a query parameter
        if user is not None:
            self.pars["user.name"] = user

    if proxy_to is not None:
        self.pars["doas"] = proxy_to
    if kerberos and user is not None:
        raise ValueError(
            "If using Kerberos auth, do not specify the "
            "user, this is handled by kinit."
        )
    self._connect()

    self._fsid = f"webhdfs_{tokenize(host, port)}"
134
+
135
@property
def fsid(self):
    """Identifier string for this filesystem, as stored in ``self._fsid``."""
    identifier = self._fsid
    return identifier
138
+
139
def _connect(self):
    """Create the HTTP session and attach authentication.

    Kerberos auth is attached when ``self.kerb`` is set. Basic auth is
    attached when both user and password are set, replacing any auth
    attached before it.
    """
    self.session = session = requests.Session()
    if self.kerb:
        from requests_kerberos import HTTPKerberosAuth

        session.auth = HTTPKerberosAuth(**self.kerb_kwargs)

    has_basic_credentials = self.user is not None and self.password is not None
    if has_basic_credentials:
        from requests.auth import HTTPBasicAuth

        session.auth = HTTPBasicAuth(self.user, self.password)
150
+
151
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
    """Send one WebHDFS request and return the response object.

    Query parameters are ``kwargs`` overlaid with ``self.pars`` and the
    upper-cased ``op``. For status codes 400, 401, 403, 404 and 500 a
    ``RemoteException`` payload, if present, is turned into ValueError,
    PermissionError, FileNotFoundError or RuntimeError; otherwise
    ``raise_for_status`` decides.
    """
    url = self._apply_proxy(self.url + quote(path or "", safe="/="))
    params = {**kwargs, **self.pars, "op": op.upper()}
    logger.debug("sending %s with %s", url, method)
    out = self.session.request(
        method=method.upper(),
        url=url,
        params=params,
        data=data,
        allow_redirects=redirect,
    )
    if out.status_code in [400, 401, 403, 404, 500]:
        try:
            remote = out.json()["RemoteException"]
            msg = remote["message"]
            exp = remote["exception"]
        except (ValueError, KeyError):
            # no parsable remote exception: fall through to raise_for_status
            pass
        else:
            if exp in ("IllegalArgumentException", "UnsupportedOperationException"):
                raise ValueError(msg)
            if exp in ("SecurityException", "AccessControlException"):
                raise PermissionError(msg)
            if exp in ("FileNotFoundException",):
                raise FileNotFoundError(msg)
            raise RuntimeError(msg)
    out.raise_for_status()
    return out
182
+
183
def _open(
    self,
    path,
    mode="rb",
    block_size=None,
    autocommit=True,
    replication=None,
    permissions=None,
    **kwargs,
):
    """

    Parameters
    ----------
    path: str
        File location
    mode: str
        'rb', 'wb', etc.
    block_size: int
        Client buffer size for read-ahead or write buffer; falls back to
        ``self.blocksize`` when not given
    autocommit: bool
        If False, writes to temporary file that only gets put in final
        location upon commit
    replication: int
        Number of copies of file on the cluster, write mode only
    permissions: str or int
        posix permissions, write mode only
    kwargs
        Accepted but not forwarded

    Returns
    -------
    WebHDFile instance
    """
    options = {
        "mode": mode,
        "block_size": block_size or self.blocksize,
        "tempdir": self.tempdir,
        "autocommit": autocommit,
        "replication": replication,
        "permissions": permissions,
    }
    return WebHDFile(self, path, **options)
227
+
228
+ @staticmethod
229
+ def _process_info(info):
230
+ info["type"] = info["type"].lower()
231
+ info["size"] = info["length"]
232
+ return info
233
+
234
@classmethod
def _strip_protocol(cls, path):
    """Return only the path component of a URL-like ``path``."""
    options = infer_storage_options(path)
    return options["path"]
237
+
238
@staticmethod
def _get_kwargs_from_urls(urlpath):
    """Extract constructor keyword arguments from a URL.

    The ``path`` and ``protocol`` entries are dropped and ``username`` is
    renamed to ``user``.
    """
    out = infer_storage_options(urlpath)
    for unwanted in ("path", "protocol"):
        out.pop(unwanted, None)
    if "username" in out:
        out["user"] = out.pop("username")
    return out
246
+
247
+ def info(self, path):
248
+ out = self._call("GETFILESTATUS", path=path)
249
+ info = out.json()["FileStatus"]
250
+ info["name"] = path
251
+ return self._process_info(info)
252
+
253
+ def ls(self, path, detail=False):
254
+ out = self._call("LISTSTATUS", path=path)
255
+ infos = out.json()["FileStatuses"]["FileStatus"]
256
+ for info in infos:
257
+ self._process_info(info)
258
+ info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
259
+ if detail:
260
+ return sorted(infos, key=lambda i: i["name"])
261
+ else:
262
+ return sorted(info["name"] for info in infos)
263
+
264
def content_summary(self, path):
    """Total numbers of files, directories and bytes under path"""
    response = self._call("GETCONTENTSUMMARY", path=path)
    summary = response.json()["ContentSummary"]
    return summary
268
+
269
def ukey(self, path):
    """Checksum info of file, giving method and result"""
    reply = self._call("GETFILECHECKSUM", path=path, redirect=False)
    if "Location" in reply.headers:
        # redirects are not followed automatically, so the (proxied)
        # redirect target is fetched by hand
        target = self._apply_proxy(reply.headers["Location"])
        reply = self.session.get(target)
    reply.raise_for_status()
    return reply.json()["FileChecksum"]
280
+
281
def home_directory(self):
    """Get user's home directory"""
    response = self._call("GETHOMEDIRECTORY")
    home = response.json()["Path"]
    return home
285
+
286
+ def get_delegation_token(self, renewer=None):
287
+ """Retrieve token which can give the same authority to other uses
288
+
289
+ Parameters
290
+ ----------
291
+ renewer: str or None
292
+ User who may use this token; if None, will be current user
293
+ """
294
+ if renewer:
295
+ out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
296
+ else:
297
+ out = self._call("GETDELEGATIONTOKEN")
298
+ t = out.json()["Token"]
299
+ if t is None:
300
+ raise ValueError("No token available for this user/security context")
301
+ return t["urlString"]
302
+
303
def renew_delegation_token(self, token):
    """Make token live longer. Returns new expiry time"""
    response = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
    expiry = response.json()["long"]
    return expiry
307
+
308
def cancel_delegation_token(self, token):
    """Stop the token from being useful"""
    request = {"method": "put", "token": token}
    self._call("CANCELDELEGATIONTOKEN", **request)
311
+
312
def chmod(self, path, mod):
    """Set the permission at path

    Parameters
    ----------
    path: str
        location to set (file or directory)
    mod: str or int
        posix representation of the permission, given as an octal string,
        e.g. '777', or as 0o777
    """
    request = {"method": "put", "path": path, "permission": mod}
    self._call("SETPERMISSION", **request)
324
+
325
def chown(self, path, owner=None, group=None):
    """Change owning user and/or group

    Only the values that are not ``None`` are forwarded with the
    ``SETOWNER`` request.
    """
    candidates = (("owner", owner), ("group", group))
    changes = {field: who for field, who in candidates if who is not None}
    self._call("SETOWNER", method="put", path=path, **changes)
333
+
334
def set_replication(self, path, replication):
    """
    Set file replication factor

    Parameters
    ----------
    path: str
        File location (not for directories)
    replication: int
        Number of copies of file on the cluster. Should be smaller than
        number of data nodes; normally 3 on most systems.
    """
    request = {"method": "put", "path": path, "replication": replication}
    self._call("SETREPLICATION", **request)
347
+
348
def mkdir(self, path, **kwargs):
    """Create the directory ``path`` with a ``MKDIRS`` PUT request.

    Extra keyword arguments are accepted but not used.
    """
    request = {"method": "put", "path": path}
    self._call("MKDIRS", **request)
350
+
351
def makedirs(self, path, exist_ok=False):
    """Create ``path`` via ``self.mkdir``.

    Raises
    ------
    FileExistsError
        If ``exist_ok`` is ``False`` and ``path`` already exists.
    """
    # the existence probe is only made when the caller asked for strictness
    if exist_ok is False:
        if self.exists(path):
            raise FileExistsError(path)
    self.mkdir(path)
355
+
356
def mv(self, path1, path2, **kwargs):
    """Rename ``path1`` to ``path2`` with a ``RENAME`` PUT request.

    Extra keyword arguments are accepted but not used.
    """
    request = {"method": "put", "path": path1, "destination": path2}
    self._call("RENAME", **request)
358
+
359
def rm(self, path, recursive=False, **kwargs):
    """Delete ``path`` with a ``DELETE`` request.

    The truthiness of ``recursive`` is sent as the lowercase string
    ``"true"`` or ``"false"``.  Extra keyword arguments are not used.
    """
    flag = ("false", "true")[bool(recursive)]
    self._call("DELETE", method="delete", path=path, recursive=flag)
366
+
367
def rm_file(self, path, **kwargs):
    """Delete the single file at ``path`` by delegating to ``self.rm``.

    Extra keyword arguments are accepted but not forwarded.
    """
    self.rm(path)
369
+
370
def cp_file(self, lpath, rpath, **kwargs):
    """Copy ``lpath`` to ``rpath`` through a temporary sibling file.

    The data is streamed into a randomly named ``.tmp.<hex>`` file in the
    parent of ``rpath`` and then moved onto ``rpath``.  On any failure the
    temporary file is removed (a missing one is tolerated) and the
    original exception is re-raised.
    """
    with self.open(lpath) as source:
        staging = "/".join((self._parent(rpath), ".tmp." + secrets.token_hex(16)))
        try:
            with self.open(staging, "wb") as sink:
                shutil.copyfileobj(source, sink)
            self.mv(staging, rpath)
        except BaseException:  # noqa
            # clean up even on KeyboardInterrupt, then propagate
            try:
                self.rm(staging)
            except FileNotFoundError:
                pass
            raise
383
+
384
def _apply_proxy(self, location):
    """Rewrite ``location`` according to ``self.proxy``.

    A falsy ``self.proxy`` leaves the value untouched.  A callable is
    invoked with the location and its result returned.  Anything else is
    treated as a dict whose keys are each replaced, first occurrence
    only, by the matching value.
    """
    proxy = self.proxy
    if not proxy:
        return location
    if callable(proxy):
        return proxy(location)
    # as a dict
    for old, new in proxy.items():
        location = location.replace(old, new, 1)
    return location
392
+
393
+
394
class WebHDFile(AbstractBufferedFile):
    """A file living in HDFS over webHDFS"""

    def __init__(self, fs, path, **kwargs):
        """Set up the buffered file.

        When ``autocommit`` is absent or ``False``, the intended
        destination is remembered in ``self.target`` and ``self.path`` is
        pointed at a uniquely named file under ``tempdir``; ``commit``
        later moves it into place.

        Parameters
        ----------
        fs: the owning filesystem instance
        path: str
            Location of the file
        **kwargs:
            Forwarded to the parent class.  ``tempdir`` is required;
            ``permissions``, ``replication`` and ``autocommit`` are also
            read here.
        """
        super().__init__(fs, path, **kwargs)
        kwargs = kwargs.copy()
        # treat explicit ``None`` the same as "not given"
        if kwargs.get("permissions", None) is None:
            kwargs.pop("permissions", None)
        if kwargs.get("replication", None) is None:
            kwargs.pop("replication", None)
        self.permissions = kwargs.pop("permissions", 511)
        tempdir = kwargs.pop("tempdir")
        if kwargs.pop("autocommit", False) is False:
            self.target = self.path
            self.path = os.path.join(tempdir, str(uuid.uuid4()))

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Posts the whole current buffer to ``self.location``.

        Parameters
        ==========
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        out = self.fs.session.post(
            self.location,
            data=self.buffer.getvalue(),
            headers={"content-type": "application/octet-stream"},
        )
        out.raise_for_status()
        return True

    def _initiate_upload(self):
        """Create remote file/upload

        Leaves the URL that subsequent chunks are posted to in
        ``self.location``.
        """
        kwargs = self.kwargs.copy()
        if "a" in self.mode:
            op, method = "APPEND", "POST"
        else:
            op, method = "CREATE", "PUT"
            kwargs["overwrite"] = "true"
        out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
        location = self.fs._apply_proxy(out.headers["Location"])
        if "w" in self.mode:
            # create empty file to append to
            out2 = self.fs.session.put(
                location, headers={"content-type": "application/octet-stream"}
            )
            out2.raise_for_status()
            # after creating empty file, change location to append to
            out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
            location = self.fs._apply_proxy(out2.headers["Location"])
        # In append mode the first call was already APPEND, so its redirect
        # is the upload target; previously nothing was stored in that case
        # and _upload_chunk failed for lack of ``self.location``.
        self.location = location

    def _fetch_range(self, start, end):
        """Return the bytes in ``[start, end)``, clipped to the file size.

        An empty or out-of-range request returns ``b""`` without any
        network call.  If the ``OPEN`` reply redirects, the redirect is
        followed through the proxy mapping.
        """
        start = max(start, 0)
        end = min(self.size, end)
        if start >= end or start >= self.size:
            return b""
        out = self.fs._call(
            "OPEN", path=self.path, offset=start, length=end - start, redirect=False
        )
        out.raise_for_status()
        if "Location" in out.headers:
            location = out.headers["Location"]
            out2 = self.fs.session.get(self.fs._apply_proxy(location))
            # without this check an HTTP error body would be returned as
            # if it were file content
            out2.raise_for_status()
            return out2.content
        else:
            return out.content

    def commit(self):
        """Move the temporary file onto its final destination."""
        self.fs.mv(self.path, self.target)

    def discard(self):
        """Remove the file written so far."""
        self.fs.rm(self.path)
lib/python3.11/site-packages/fsspec/implementations/zip.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zipfile
2
+
3
+ import fsspec
4
+ from fsspec.archive import AbstractArchiveFileSystem
5
+
6
+
7
class ZipFileSystem(AbstractArchiveFileSystem):
    """Read/Write contents of ZIP archive as a file-system

    Keeps file object open while instance lives.

    This class is pickleable, but not necessarily thread-safe
    """

    root_marker = ""
    protocol = "zip"
    cachable = False

    def __init__(
        self,
        fo="",
        mode="r",
        target_protocol=None,
        target_options=None,
        compression=zipfile.ZIP_STORED,
        allowZip64=True,
        compresslevel=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Contains ZIP, and must exist. If a str, will fetch file using
            :meth:`~fsspec.open_files`, which must return one file exactly.
        mode: str
            Accept: "r", "w", "a"
        target_protocol: str (optional)
            If ``fo`` is a string, this value can be used to override the
            FS protocol inferred from a URL
        target_options: dict (optional)
            Kwargs passed when instantiating the target FS, if ``fo`` is
            a string.
        compression, allowZip64, compresslevel: passed to ZipFile
            Only relevant when creating a ZIP

        Raises
        ------
        ValueError
            If ``mode`` is not one of "r", "w" or "a".
        """
        # ``self`` is bound by super() already; it must not be passed again
        super().__init__(**kwargs)
        if mode not in set("rwa"):
            raise ValueError(f"mode '{mode}' not understood")
        self.mode = mode
        if isinstance(fo, str):
            # appending needs the existing archive opened read/write
            if mode == "a":
                m = "r+b"
            else:
                m = mode + "b"
            fo = fsspec.open(
                fo, mode=m, protocol=target_protocol, **(target_options or {})
            )
        self.of = fo
        self.fo = fo.__enter__()  # the whole instance is a context
        self.zip = zipfile.ZipFile(
            self.fo,
            mode=mode,
            compression=compression,
            allowZip64=allowZip64,
            compresslevel=compresslevel,
        )
        self.dir_cache = None

    @classmethod
    def _strip_protocol(cls, path):
        """Return ``path`` without protocol and without leading slashes."""
        # zip file paths are always relative to the archive root
        return super()._strip_protocol(path).lstrip("/")

    def __del__(self):
        # ``zip`` is missing if __init__ failed before creating it
        if hasattr(self, "zip"):
            self.close()
            del self.zip

    def close(self):
        """Commits any write changes to the file. Done on ``del`` too."""
        self.zip.close()

    def _get_dirs(self):
        """Populate ``self.dir_cache`` with an entry per archive member.

        The cache is rebuilt on every call in write/append mode, and only
        once in read mode.
        """
        if self.dir_cache is None or self.mode in set("wa"):
            # when writing, dir_cache is always in the ZipFile's attributes,
            # not read from the file.
            files = self.zip.infolist()
            # start with directory entries derived from the member names
            self.dir_cache = {
                dirname.rstrip("/"): {
                    "name": dirname.rstrip("/"),
                    "size": 0,
                    "type": "directory",
                }
                for dirname in self._all_dirnames(self.zip.namelist())
            }
            # then add (or overwrite with) the real member entries
            for z in files:
                f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
                f.update(
                    {
                        "name": z.filename.rstrip("/"),
                        "size": z.file_size,
                        "type": ("directory" if z.is_dir() else "file"),
                    }
                )
                self.dir_cache[f["name"]] = f

    def pipe_file(self, path, value, **kwargs):
        """Write ``value`` as the archive member ``path`` in one call."""
        # override upstream, because we know the exact file size in this case
        self.zip.writestr(path, value, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Open an archive member.

        Raises
        ------
        OSError
            If the requested mode conflicts with the mode the archive was
            opened in (reading an existing member of a writable archive,
            or writing to a read-only one).
        FileNotFoundError
            If a member is read from a writable archive and does not exist.
        """
        path = self._strip_protocol(path)
        if "r" in mode and self.mode in set("wa"):
            if self.exists(path):
                raise OSError("ZipFS can only be open for reading or writing, not both")
            raise FileNotFoundError(path)
        if "r" in self.mode and "w" in mode:
            raise OSError("ZipFS can only be open for reading or writing, not both")
        out = self.zip.open(path, mode.strip("b"))
        if "r" in mode:
            # attach size and name taken from the cached listing
            info = self.info(path)
            out.size = info["size"]
            out.name = info["name"]
        return out
lib/python3.11/site-packages/fsspec/mapping.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import array
2
+ import posixpath
3
+ import warnings
4
+ from collections.abc import MutableMapping
5
+ from functools import cached_property
6
+
7
+ from .core import url_to_fs
8
+
9
+
10
class FSMap(MutableMapping):
    """Wrap a FileSystem instance as a mutable mapping.

    The keys of the mapping become files under the given root, and the
    values (which must be bytes) the contents of those files.

    Parameters
    ----------
    root: string
        prefix for all the files
    fs: FileSystem instance
    check: bool (=False)
        performs a touch at the location, to check for write access.
    create: bool (=False)
        make the root directory if it does not exist yet.
    missing_exceptions: None or tuple
        exception types that are reported as ``KeyError``; defaults to
        (FileNotFoundError, IsADirectoryError, NotADirectoryError)

    Examples
    --------
    >>> fs = FileSystem(**parameters)  # doctest: +SKIP
    >>> d = FSMap('my-data/path/', fs)  # doctest: +SKIP
    or, more likely
    >>> d = fs.get_mapper('my-data/path/')  # doctest: +SKIP

    >>> d['loc1'] = b'Hello World'  # doctest: +SKIP
    >>> list(d.keys())  # doctest: +SKIP
    ['loc1']
    >>> d['loc1']  # doctest: +SKIP
    b'Hello World'
    """

    def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
        self.fs = fs
        self.root = fs._strip_protocol(root).rstrip("/")
        # prefix (including trailing separator) that is prepended to every key
        self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
        if missing_exceptions is None:
            missing_exceptions = (
                FileNotFoundError,
                IsADirectoryError,
                NotADirectoryError,
            )
        self.missing_exceptions = missing_exceptions
        self.check = check
        self.create = create
        if create:
            if not self.fs.exists(root):
                self.fs.mkdir(root)
        if check:
            if not self.fs.exists(root):
                raise ValueError(
                    f"Path {root} does not exist. Create "
                    f" with the ``create=True`` keyword"
                )
            # prove write access by creating and removing a scratch file
            self.fs.touch(root + "/a")
            self.fs.rm(root + "/a")

    @cached_property
    def dirfs(self):
        """dirfs instance that can be used with the same keys as the mapper"""
        from .implementations.dirfs import DirFileSystem

        return DirFileSystem(path=self._root_key_to_str, fs=self.fs)

    def clear(self):
        """Remove all keys below root - empties out mapping

        Best effort: ordinary errors from the backend are ignored.
        """
        try:
            self.fs.rm(self.root, True)
            self.fs.mkdir(self.root)
        except Exception:
            # deliberately ignored, but KeyboardInterrupt/SystemExit are
            # no longer swallowed
            pass

    def getitems(self, keys, on_error="raise"):
        """Fetch multiple items from the store

        If the backend is async-able, this might proceed concurrently

        Parameters
        ----------
        keys: list(str)
            They keys to be fetched
        on_error : "raise", "omit", "return"
            If raise, an underlying exception will be raised (converted to KeyError
            if the type is in self.missing_exceptions); if omit, keys with exception
            will simply not be included in the output; if "return", all keys are
            included in the output, but the value will be bytes or an exception
            instance.

        Returns
        -------
        dict(key, bytes|exception)
        """
        keys2 = [self._key_to_str(k) for k in keys]
        # "omit" is implemented by asking for exceptions and filtering below
        oe = on_error if on_error == "raise" else "return"
        try:
            out = self.fs.cat(keys2, on_error=oe)
            if isinstance(out, bytes):
                out = {keys2[0]: out}
        except self.missing_exceptions as e:
            raise KeyError from e
        out = {
            k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
            for k, v in out.items()
        }
        return {
            key: out[k2]
            for key, k2 in zip(keys, keys2)
            if on_error == "return" or not isinstance(out[k2], BaseException)
        }

    def setitems(self, values_dict):
        """Set the values of multiple items in the store

        Parameters
        ----------
        values_dict: dict(str, bytes)
        """
        values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
        self.fs.pipe(values)

    def delitems(self, keys):
        """Remove multiple keys from the store"""
        self.fs.rm([self._key_to_str(k) for k in keys])

    def _key_to_str(self, key):
        """Generate full path for the key"""
        if not isinstance(key, str):
            # NOTE: non-str keys are deprecated; for now they are converted
            # with ``str`` (lists via ``tuple`` first) after a warning
            warnings.warn(
                "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
                DeprecationWarning,
            )
            if isinstance(key, list):
                key = tuple(key)
            key = str(key)
        return f"{self._root_key_to_str}{key}"

    def _str_to_key(self, s):
        """Strip path of to leave key name"""
        return s[len(self.root) :].lstrip("/")

    def __getitem__(self, key, default=None):
        """Retrieve data

        A missing key raises ``KeyError`` unless a non-``None`` ``default``
        is given, in which case that default is returned.
        """
        k = self._key_to_str(key)
        try:
            result = self.fs.cat(k)
        except self.missing_exceptions:
            if default is not None:
                return default
            raise KeyError(key)
        return result

    def pop(self, key, default=None):
        """Pop data

        Note that a ``default`` of ``None`` cannot be told apart from "no
        default", so a missing key then raises ``KeyError``.
        """
        result = self.__getitem__(key, default)
        try:
            del self[key]
        except KeyError:
            pass
        return result

    def __setitem__(self, key, value):
        """Store value in key"""
        key = self._key_to_str(key)
        self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
        self.fs.pipe_file(key, maybe_convert(value))

    def __iter__(self):
        return (self._str_to_key(x) for x in self.fs.find(self.root))

    def __len__(self):
        return len(self.fs.find(self.root))

    def __delitem__(self, key):
        """Remove key

        Raises
        ------
        KeyError
            If the backend fails to remove the file; the backend error is
            attached as the cause.
        """
        try:
            self.fs.rm(self._key_to_str(key))
        except Exception as exc:
            # only ordinary errors become KeyError, so an interrupt is not
            # misreported as a missing key
            raise KeyError(key) from exc

    def __contains__(self, key):
        """Does key exist in mapping?"""
        path = self._key_to_str(key)
        return self.fs.exists(path) and self.fs.isfile(path)

    def __reduce__(self):
        # check/create are reset to False for the reconstructed mapper
        return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
193
+
194
+
195
def maybe_convert(value):
    """Turn array-like values into ``bytes``; pass anything else through.

    A value counts as array-like if it is an ``array.array`` or exposes
    ``__array__``.  Values whose ``dtype.kind`` is "M" or "m" are first
    viewed as ``int64``.
    """
    if not isinstance(value, array.array) and not hasattr(value, "__array__"):
        return value
    # bytes-like things
    if hasattr(value, "dtype") and value.dtype.kind in "Mm":
        # The buffer interface doesn't support datetime64/timedelta64 numpy
        # arrays
        value = value.view("int64")
    return bytes(memoryview(value))
204
+
205
+
206
def get_mapper(
    url="",
    check=False,
    create=False,
    missing_exceptions=None,
    alternate_root=None,
    **kwargs,
):
    """Create key-value interface for given URL and options

    The URL will be of the form "protocol://location" and point to the root
    of the mapper required. All keys will be file-names below this location,
    and their values the contents of each key.

    Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.

    Parameters
    ----------
    url: str
        Root URL of mapping
    check: bool
        Passed to ``FSMap``, which then verifies that the root exists and
        is writable before instantiation completes
    create: bool
        Whether to make the directory corresponding to the root before
        instantiating
    missing_exceptions: None or tuple
        If given, these exception types will be regarded as missing keys and
        return KeyError when trying to read data. By default, you get
        (FileNotFoundError, IsADirectoryError, NotADirectoryError)
    alternate_root: None or str
        In cases of complex URLs, the parser may fail to pick the correct part
        for the mapper root, so this arg can override
    **kwargs:
        Forwarded to ``url_to_fs`` along with ``url``

    Returns
    -------
    ``FSMap`` instance, the dict-like key-value store.
    """
    # Removing protocol here - could defer to each open() on the backend
    filesystem, parsed_path = url_to_fs(url, **kwargs)
    if alternate_root is None:
        mapper_root = parsed_path
    else:
        mapper_root = alternate_root
    return FSMap(
        mapper_root,
        filesystem,
        check,
        create,
        missing_exceptions=missing_exceptions,
    )