8643e0bff997343019440dbb643d796bcec3eb5d247e4cbf77173c5941ca163f
- lib/python3.11/site-packages/fsspec/fuse.py +324 -0
- lib/python3.11/site-packages/fsspec/generic.py +403 -0
- lib/python3.11/site-packages/fsspec/gui.py +413 -0
- lib/python3.11/site-packages/fsspec/implementations/__init__.py +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/__init__.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/arrow.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/data.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/ftp.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/git.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/github.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/http.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/memory.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/sftp.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/smb.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/tar.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc +0 -0
- lib/python3.11/site-packages/fsspec/implementations/arrow.py +297 -0
- lib/python3.11/site-packages/fsspec/implementations/cache_mapper.py +80 -0
- lib/python3.11/site-packages/fsspec/implementations/cache_metadata.py +232 -0
- lib/python3.11/site-packages/fsspec/implementations/cached.py +864 -0
- lib/python3.11/site-packages/fsspec/implementations/dask.py +152 -0
- lib/python3.11/site-packages/fsspec/implementations/data.py +48 -0
- lib/python3.11/site-packages/fsspec/implementations/dbfs.py +457 -0
- lib/python3.11/site-packages/fsspec/implementations/dirfs.py +358 -0
- lib/python3.11/site-packages/fsspec/implementations/ftp.py +380 -0
- lib/python3.11/site-packages/fsspec/implementations/git.py +127 -0
- lib/python3.11/site-packages/fsspec/implementations/github.py +215 -0
- lib/python3.11/site-packages/fsspec/implementations/http.py +864 -0
- lib/python3.11/site-packages/fsspec/implementations/jupyter.py +124 -0
- lib/python3.11/site-packages/fsspec/implementations/libarchive.py +213 -0
- lib/python3.11/site-packages/fsspec/implementations/local.py +414 -0
- lib/python3.11/site-packages/fsspec/implementations/memory.py +292 -0
- lib/python3.11/site-packages/fsspec/implementations/reference.py +1122 -0
- lib/python3.11/site-packages/fsspec/implementations/sftp.py +180 -0
- lib/python3.11/site-packages/fsspec/implementations/smb.py +324 -0
- lib/python3.11/site-packages/fsspec/implementations/tar.py +124 -0
- lib/python3.11/site-packages/fsspec/implementations/webhdfs.py +467 -0
- lib/python3.11/site-packages/fsspec/implementations/zip.py +133 -0
- lib/python3.11/site-packages/fsspec/mapping.py +247 -0
lib/python3.11/site-packages/fsspec/fuse.py
ADDED
@@ -0,0 +1,324 @@
import argparse
import logging
import os
import stat
import threading
import time
from errno import EIO, ENOENT

from fuse import FUSE, FuseOSError, LoggingMixIn, Operations

from fsspec import __version__
from fsspec.core import url_to_fs

logger = logging.getLogger("fsspec.fuse")


class FUSEr(Operations):
    def __init__(self, fs, path, ready_file=False):
        self.fs = fs
        self.cache = {}
        self.root = path.rstrip("/") + "/"
        self.counter = 0
        logger.info("Starting FUSE at %s", path)
        self._ready_file = ready_file

    def getattr(self, path, fh=None):
        logger.debug("getattr %s", path)
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            return {"type": "file", "st_size": 5}

        path = "".join([self.root, path.lstrip("/")]).rstrip("/")
        try:
            info = self.fs.info(path)
        except FileNotFoundError:
            raise FuseOSError(ENOENT)

        data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
        perm = info.get("mode", 0o777)

        if info["type"] != "file":
            data["st_mode"] = stat.S_IFDIR | perm
            data["st_size"] = 0
            data["st_blksize"] = 0
        else:
            data["st_mode"] = stat.S_IFREG | perm
            data["st_size"] = info["size"]
            data["st_blksize"] = 5 * 2**20
            data["st_nlink"] = 1
        data["st_atime"] = info["atime"] if "atime" in info else time.time()
        data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
        data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
        return data

    def readdir(self, path, fh):
        logger.debug("readdir %s", path)
        path = "".join([self.root, path.lstrip("/")])
        files = self.fs.ls(path, False)
        files = [os.path.basename(f.rstrip("/")) for f in files]
        return [".", ".."] + files

    def mkdir(self, path, mode):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.mkdir(path)
        return 0

    def rmdir(self, path):
        path = "".join([self.root, path.lstrip("/")])
        self.fs.rmdir(path)
        return 0

    def read(self, path, size, offset, fh):
        logger.debug("read %s", (path, size, offset))
        if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
            # status indicator
            return b"ready"

        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        logger.debug("write %s", (path, offset))
        f = self.cache[fh]
        f.seek(offset)
        f.write(data)
        return len(data)

    def create(self, path, flags, fi=None):
        logger.debug("create %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        self.fs.touch(fn)  # OS will want to get attributes immediately
        f = self.fs.open(fn, "wb")
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        logger.debug("open %s", (path, flags))
        fn = "".join([self.root, path.lstrip("/")])
        if flags % 2 == 0:
            # read
            mode = "rb"
        else:
            # write/create
            mode = "wb"
        self.cache[self.counter] = self.fs.open(fn, mode)
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        fn = "".join([self.root, path.lstrip("/")])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.fs.touch(fn)

    def unlink(self, path):
        fn = "".join([self.root, path.lstrip("/")])
        try:
            self.fs.rm(fn, False)
        except (OSError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        try:
            if fh in self.cache:
                f = self.cache[fh]
                f.close()
                self.cache.pop(fh)
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        if hasattr(self.fs, "chmod"):
            path = "".join([self.root, path.lstrip("/")])
            return self.fs.chmod(path, mode)
        raise NotImplementedError


def run(
    fs,
    path,
    mount_point,
    foreground=True,
    threads=False,
    ready_file=False,
    ops_class=FUSEr,
):
    """Mount stuff in a local directory

    This uses fusepy to make it appear as if a given path on an fsspec
    instance is in fact resident within the local file-system.

    This requires that fusepy be installed, and that FUSE be available on
    the system (typically requiring a package to be installed with
    apt, yum, brew, etc.).

    Parameters
    ----------
    fs: file-system instance
        From one of the compatible implementations
    path: str
        Location on that file-system to regard as the root directory to
        mount. Note that you typically should include the terminating "/"
        character.
    mount_point: str
        An empty directory on the local file-system where the contents of
        the remote path will appear.
    foreground: bool
        Whether or not calling this function will block. Operation will
        typically be more stable if True.
    threads: bool
        Whether or not to create threads when responding to file operations
        within the mounted directory. Operation will typically be more
        stable if False.
    ready_file: bool
        Whether the FUSE process is ready. The ``.fuse_ready`` file will
        exist in the ``mount_point`` directory if True. Debugging purpose.
    ops_class: FUSEr or Subclass of FUSEr
        To override the default behavior of FUSEr. For example, logging
        to file.

    """
    func = lambda: FUSE(
        ops_class(fs, path, ready_file=ready_file),
        mount_point,
        nothreads=not threads,
        foreground=foreground,
    )
    if not foreground:
        th = threading.Thread(target=func)
        th.daemon = True
        th.start()
        return th
    else:  # pragma: no cover
        try:
            func()
        except KeyboardInterrupt:
            pass


def main(args):
    """Mount filesystem from chained URL to MOUNT_POINT.

    Examples:

    python3 -m fsspec.fuse memory /usr/share /tmp/mem

    python3 -m fsspec.fuse local /tmp/source /tmp/local \\
            -l /tmp/fsspecfuse.log

    You can also mount chained-URLs and use special settings:

    python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
            / /tmp/zip \\
            -o 'filecache-cache_storage=/tmp/simplecache'

    You can specify the type of the setting by using `[int]` or `[bool]`,
    (`true`, `yes`, `1` represents the Boolean value `True`):

    python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
            /historic/packages/RPMS /tmp/ftp \\
            -o 'simplecache-cache_storage=/tmp/simplecache' \\
            -o 'simplecache-check_files=false[bool]' \\
            -o 'ftp-listings_expiry_time=60[int]' \\
            -o 'ftp-username=anonymous' \\
            -o 'ftp-password=xieyanbo'
    """

    class RawDescriptionArgumentParser(argparse.ArgumentParser):
        def format_help(self):
            usage = super().format_help()
            parts = usage.split("\n\n")
            parts[1] = self.description.rstrip()
            return "\n\n".join(parts)

    parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("url", type=str, help="fs url")
    parser.add_argument("source_path", type=str, help="source directory in fs")
    parser.add_argument("mount_point", type=str, help="local directory")
    parser.add_argument(
        "-o",
        "--option",
        action="append",
        help="Any options of protocol included in the chained URL",
    )
    parser.add_argument(
        "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
    )
    parser.add_argument(
        "-f",
        "--foreground",
        action="store_false",
        help="Running in foreground or not (Default: False)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        action="store_false",
        help="Running with threads support (Default: False)",
    )
    parser.add_argument(
        "-r",
        "--ready-file",
        action="store_false",
        help="The `.fuse_ready` file will exist after FUSE is ready. "
        "(Debugging purpose, Default: False)",
    )
    args = parser.parse_args(args)

    kwargs = {}
    for item in args.option or []:
        key, sep, value = item.partition("=")
        if not sep:
            parser.error(message=f"Wrong option: {item!r}")
        val = value.lower()
        if val.endswith("[int]"):
            value = int(value[: -len("[int]")])
        elif val.endswith("[bool]"):
            value = val[: -len("[bool]")] in ["1", "yes", "true"]

        if "-" in key:
            fs_name, setting_name = key.split("-", 1)
            if fs_name in kwargs:
                kwargs[fs_name][setting_name] = value
            else:
                kwargs[fs_name] = {setting_name: value}
        else:
            kwargs[key] = value

    if args.log_file:
        logging.basicConfig(
            level=logging.DEBUG,
            filename=args.log_file,
            format="%(asctime)s %(message)s",
        )

        class LoggingFUSEr(FUSEr, LoggingMixIn):
            pass

        fuser = LoggingFUSEr
    else:
        fuser = FUSEr

    fs, url_path = url_to_fs(args.url, **kwargs)
    logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
    run(
        fs,
        args.source_path,
        args.mount_point,
        foreground=args.foreground,
        threads=args.threads,
        ready_file=args.ready_file,
        ops_class=fuser,
    )


if __name__ == "__main__":
    import sys

    main(sys.argv[1:])
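The `run()` function above is the mounting entry point added by this file. A minimal sketch of calling it from Python, assuming fusepy and a system FUSE installation are available and that `/tmp/mnt` is an existing, empty local directory (both are assumptions for illustration, not part of this commit):

import fsspec
from fsspec.fuse import run

# populate an in-memory filesystem so there is something to expose
fs = fsspec.filesystem("memory")
fs.pipe_file("/data/hello.txt", b"hello world")

# expose memory://data/ under /tmp/mnt; foreground=False returns the
# daemon thread running the FUSE loop instead of blocking this process
th = run(fs, "/data/", "/tmp/mnt", foreground=False)

The `main()` function above offers the same operation from the command line (`python3 -m fsspec.fuse ...`), as shown in its docstring examples.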
lib/python3.11/site-packages/fsspec/generic.py
ADDED
@@ -0,0 +1,403 @@
from __future__ import annotations

import inspect
import logging
import os
import shutil
import uuid
from typing import Optional

from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
from .callbacks import _DEFAULT_CALLBACK
from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs

_generic_fs = {}
logger = logging.getLogger("fsspec.generic")


def set_generic_fs(protocol, **storage_options):
    _generic_fs[protocol] = filesystem(protocol, **storage_options)


default_method = "default"


def _resolve_fs(url, method=None, protocol=None, storage_options=None):
    """Pick instance of backend FS"""
    method = method or default_method
    protocol = protocol or split_protocol(url)[0]
    storage_options = storage_options or {}
    if method == "default":
        return filesystem(protocol)
    if method == "generic":
        return _generic_fs[protocol]
    if method == "current":
        cls = get_filesystem_class(protocol)
        return cls.current()
    if method == "options":
        fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
        return fs
    raise ValueError(f"Unknown FS resolution method: {method}")


def rsync(
    source,
    destination,
    delete_missing=False,
    source_field="size",
    dest_field="size",
    update_cond="different",
    inst_kwargs=None,
    fs=None,
    **kwargs,
):
    """Sync files between two directory trees

    (experimental)

    Parameters
    ----------
    source: str
        Root of the directory tree to take files from. This must be a directory, but
        do not include any terminating "/" character
    destination: str
        Root path to copy into. The contents of this location should be
        identical to the contents of ``source`` when done. This will be made a
        directory, and the terminal "/" should not be included.
    delete_missing: bool
        If there are paths in the destination that don't exist in the
        source and this is True, delete them. Otherwise, leave them alone.
    source_field: str | callable
        If ``update_cond`` is "different", this is the key in the info
        of source files to consider for difference. May be a function of the
        info dict.
    dest_field: str | callable
        If ``update_cond`` is "different", this is the key in the info
        of destination files to consider for difference. May be a function of
        the info dict.
    update_cond: "different"|"always"|"never"
        If "always", every file is copied, regardless of whether it exists in
        the destination. If "never", files that exist in the destination are
        not copied again. If "different" (default), only copy if the info
        fields given by ``source_field`` and ``dest_field`` (usually "size")
        are different. Other comparisons may be added in the future.
    inst_kwargs: dict|None
        If ``fs`` is None, use this set of keyword arguments to make a
        GenericFileSystem instance
    fs: GenericFileSystem|None
        Instance to use if explicitly given. The instance defines how to
        make downstream file system instances from paths.
    """
    fs = fs or GenericFileSystem(**(inst_kwargs or {}))
    source = fs._strip_protocol(source)
    destination = fs._strip_protocol(destination)
    allfiles = fs.find(source, withdirs=True, detail=True)
    if not fs.isdir(source):
        raise ValueError("Can only rsync on a directory")
    otherfiles = fs.find(destination, withdirs=True, detail=True)
    dirs = [
        a
        for a, v in allfiles.items()
        if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
    ]
    logger.debug(f"{len(dirs)} directories to create")
    if dirs:
        fs.make_many_dirs(
            [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
        )
    allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
    logger.debug(f"{len(allfiles)} files to consider for copy")
    to_delete = [
        o
        for o, v in otherfiles.items()
        if o.replace(destination, source) not in allfiles and v["type"] == "file"
    ]
    for k, v in allfiles.copy().items():
        otherfile = k.replace(source, destination)
        if otherfile in otherfiles:
            if update_cond == "always":
                allfiles[k] = otherfile
            elif update_cond == "different":
                inf1 = source_field(v) if callable(source_field) else v[source_field]
                v2 = otherfiles[otherfile]
                inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
                if inf1 != inf2:
                    # details mismatch, make copy
                    allfiles[k] = otherfile
                else:
                    # details match, don't copy
                    allfiles.pop(k)
        else:
            # file not in target yet
            allfiles[k] = otherfile
    logger.debug(f"{len(allfiles)} files to copy")
    if allfiles:
        source_files, target_files = zip(*allfiles.items())
        fs.cp(source_files, target_files, **kwargs)
    logger.debug(f"{len(to_delete)} files to delete")
    if delete_missing:
        fs.rm(to_delete)


class GenericFileSystem(AsyncFileSystem):
    """Wrapper over all other FS types

    <experimental!>

    This implementation is a single unified interface to be able to run FS operations
    over generic URLs, and dispatch to the specific implementations using the URL
    protocol prefix.

    Note: instances of this FS are always async, even if you never use it with any async
    backend.
    """

    protocol = "generic"  # there is no real reason to ever use a protocol with this FS

    def __init__(self, default_method="default", **kwargs):
        """

        Parameters
        ----------
        default_method: str (optional)
            Defines how to configure backend FS instances. Options are:
            - "default": instantiate like FSClass(), with no
              extra arguments; this is the default instance of that FS, and can be
              configured via the config system
            - "generic": takes instances from the `_generic_fs` dict in this module,
              which you must populate before use. Keys are by protocol
            - "current": takes the most recently instantiated version of each FS
        """
        self.method = default_method
        super().__init__(**kwargs)

    def _parent(self, path):
        fs = _resolve_fs(path, self.method)
        return fs.unstrip_protocol(fs._parent(path))

    def _strip_protocol(self, path):
        # normalization only
        fs = _resolve_fs(path, self.method)
        return fs.unstrip_protocol(fs._strip_protocol(path))

    async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            out = await fs._find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        else:
            out = fs.find(
                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
            )
        result = {}
        for k, v in out.items():
            name = fs.unstrip_protocol(k)
            v["name"] = name
            result[name] = v
        if detail:
            return result
        return list(result)

    async def _info(self, url, **kwargs):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            out = await fs._info(url, **kwargs)
        else:
            out = fs.info(url, **kwargs)
        out["name"] = fs.unstrip_protocol(out["name"])
        return out

    async def _ls(
        self,
        url,
        detail=True,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            out = await fs._ls(url, detail=True, **kwargs)
        else:
            out = fs.ls(url, detail=True, **kwargs)
        for o in out:
            o["name"] = fs.unstrip_protocol(o["name"])
        if detail:
            return out
        else:
            return [o["name"] for o in out]

    async def _cat_file(
        self,
        url,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        if fs.async_impl:
            return await fs._cat_file(url, **kwargs)
        else:
            return fs.cat_file(url, **kwargs)

    async def _pipe_file(
        self,
        path,
        value,
        **kwargs,
    ):
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            return await fs._pipe_file(path, value, **kwargs)
        else:
            return fs.pipe_file(path, value, **kwargs)

    async def _rm(self, url, **kwargs):
        urls = url
        if isinstance(urls, str):
            urls = [urls]
        fs = _resolve_fs(urls[0], self.method)
        if fs.async_impl:
            await fs._rm(urls, **kwargs)
        else:
            fs.rm(url, **kwargs)

    async def _makedirs(self, path, exist_ok=False):
        logger.debug("Make dir %s", path)
        fs = _resolve_fs(path, self.method)
        if fs.async_impl:
            await fs._makedirs(path, exist_ok=exist_ok)
        else:
            fs.makedirs(path, exist_ok=exist_ok)

    def rsync(self, source, destination, **kwargs):
        """Sync files between two directory trees

        See `func:rsync` for more details.
        """
        rsync(source, destination, fs=self, **kwargs)

    async def _cp_file(
        self,
        url,
        url2,
        blocksize=2**20,
        callback=_DEFAULT_CALLBACK,
        **kwargs,
    ):
        fs = _resolve_fs(url, self.method)
        fs2 = _resolve_fs(url2, self.method)
        if fs is fs2:
            # pure remote
            if fs.async_impl:
                return await fs._cp_file(url, url2, **kwargs)
            else:
                return fs.cp_file(url, url2, **kwargs)
        kw = {"blocksize": 0, "cache_type": "none"}
        try:
            f1 = (
                await fs.open_async(url, "rb")
                if hasattr(fs, "open_async")
                else fs.open(url, "rb", **kw)
            )
            callback.set_size(await maybe_await(f1.size))
            f2 = (
                await fs2.open_async(url2, "wb")
                if hasattr(fs2, "open_async")
                else fs2.open(url2, "wb", **kw)
            )
            while f1.size is None or f2.tell() < f1.size:
                data = await maybe_await(f1.read(blocksize))
                if f1.size is None and not data:
                    break
                await maybe_await(f2.write(data))
                callback.absolute_update(f2.tell())
        finally:
            try:
                await maybe_await(f2.close())
                await maybe_await(f1.close())
            except NameError:
                # fail while opening f1 or f2
                pass

    async def _make_many_dirs(self, urls, exist_ok=True):
        fs = _resolve_fs(urls[0], self.method)
        if fs.async_impl:
            coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
            await _run_coros_in_chunks(coros)
        else:
            for u in urls:
                fs.makedirs(u, exist_ok=exist_ok)

    make_many_dirs = sync_wrapper(_make_many_dirs)

    async def _copy(
        self,
        path1: list[str],
        path2: list[str],
        recursive: bool = False,
        on_error: str = "ignore",
        maxdepth: Optional[int] = None,
        batch_size: Optional[int] = None,
        tempdir: Optional[str] = None,
        **kwargs,
    ):
        if recursive:
            raise NotImplementedError
        fs = _resolve_fs(path1[0], self.method)
        fs2 = _resolve_fs(path2[0], self.method)
        # not expanding paths atm., assume call is from rsync()
        if fs is fs2:
            # pure remote
            if fs.async_impl:
                return await fs._copy(path1, path2, **kwargs)
            else:
                return fs.copy(path1, path2, **kwargs)
        await copy_file_op(
            fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
        )


async def copy_file_op(
    fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
):
    import tempfile

    tempdir = tempdir or tempfile.mkdtemp()
    try:
        coros = [
            _copy_file_op(
                fs1,
                u1,
                fs2,
                u2,
                os.path.join(tempdir, uuid.uuid4().hex),
                on_error=on_error,
            )
            for u1, u2 in zip(url1, url2)
        ]
        await _run_coros_in_chunks(coros, batch_size=batch_size)
    finally:
        shutil.rmtree(tempdir)


async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
    ex = () if on_error == "raise" else Exception
    logger.debug("Copy %s -> %s", url1, url2)
    try:
        if fs1.async_impl:
            await fs1._get_file(url1, local)
        else:
            fs1.get_file(url1, local)
        if fs2.async_impl:
            await fs2._put_file(local, url2)
        else:
            fs2.put_file(local, url2)
        os.unlink(local)
        logger.debug("Copy %s -> %s; done", url1, url2)
    except ex as e:
        logger.debug("ignoring cp exception for %s: %s", url1, e)


async def maybe_await(cor):
    if inspect.iscoroutine(cor):
        return await cor
    else:
        return cor
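`rsync()` above drives `GenericFileSystem`, which resolves each URL's backend filesystem from its protocol prefix. A minimal sketch of the documented call pattern, assuming a local directory `/tmp/src` exists (these paths are placeholders for illustration, not from the commit):

from fsspec.generic import GenericFileSystem, rsync

# the generic FS dispatches file:// and memory:// URLs to their backends
fs = GenericFileSystem()

# mirror a local tree into the in-memory filesystem; with the default
# update_cond="different", files are re-copied only when the "size"
# fields of source and destination differ
rsync("file:///tmp/src", "memory://dst", fs=fs)

The method form `fs.rsync(source, destination, ...)`, defined on the class above, is equivalent.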
lib/python3.11/site-packages/fsspec/gui.py
ADDED
@@ -0,0 +1,413 @@
import ast
import contextlib
import logging
import os
import re
from typing import ClassVar, Sequence

import panel as pn

from .core import OpenFile, get_filesystem_class, split_protocol
from .registry import known_implementations

pn.extension()
logger = logging.getLogger("fsspec.gui")


class SigSlot:
    """Signal-slot mixin, for Panel event passing

    Include this class in a widget manager's superclasses to be able to
    register events and callbacks on Panel widgets managed by that class.

    The method ``_register`` should be called as widgets are added, and external
    code should call ``connect`` to associate callbacks.

    By default, all signals emit a DEBUG logging statement.
    """

    # names of signals that this class may emit each of which must be
    # set by _register for any new instance
    signals: ClassVar[Sequence[str]] = []
    # names of actions that this class may respond to
    slots: ClassVar[Sequence[str]] = []

    # each of which must be a method name

    def __init__(self):
        self._ignoring_events = False
        self._sigs = {}
        self._map = {}
        self._setup()

    def _setup(self):
        """Create GUI elements and register signals"""
        self.panel = pn.pane.PaneBase()
        # no signals to set up in the base class

    def _register(
        self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
    ):
        """Watch the given attribute of a widget and assign it a named event

        This is normally called at the time a widget is instantiated, in the
        class which owns it.

        Parameters
        ----------
        widget : pn.layout.Panel or None
            Widget to watch. If None, an anonymous signal not associated with
            any widget.
        name : str
            Name of this event
        thing : str
            Attribute of the given widget to watch
        log_level : int
            When the signal is triggered, a logging event of the given level
            will be fired in the dfviz logger.
        auto : bool
            If True, automatically connects with a method in this class of the
            same name.
        """
        if name not in self.signals:
            raise ValueError(f"Attempt to assign an undeclared signal: {name}")
        self._sigs[name] = {
            "widget": widget,
            "callbacks": [],
            "thing": thing,
            "log": log_level,
        }
        wn = "-".join(
            [
                getattr(widget, "name", str(widget)) if widget is not None else "none",
                thing,
            ]
        )
        self._map[wn] = name
        if widget is not None:
            widget.param.watch(self._signal, thing, onlychanged=True)
        if auto and hasattr(self, name):
            self.connect(name, getattr(self, name))

    def _repr_mimebundle_(self, *args, **kwargs):
        """Display in a notebook or a server"""
        try:
            return self.panel._repr_mimebundle_(*args, **kwargs)
        except (ValueError, AttributeError):
            raise NotImplementedError("Panel does not seem to be set up properly")

    def connect(self, signal, slot):
        """Associate call back with given event

        The callback must be a function which takes the "new" value of the
        watched attribute as the only parameter. If the callback returns False,
        this cancels any further processing of the given event.

        Alternatively, the callback can be a string, in which case it means
        emitting the correspondingly-named event (i.e., connect to self)
        """
        self._sigs[signal]["callbacks"].append(slot)

    def _signal(self, event):
        """This is called by an action on a widget

        Within an self.ignore_events context, nothing happens.

        Tests can execute this method by directly changing the values of
        widget components.
        """
        if not self._ignoring_events:
            wn = "-".join([event.obj.name, event.name])
            if wn in self._map and self._map[wn] in self._sigs:
                self._emit(self._map[wn], event.new)

    @contextlib.contextmanager
    def ignore_events(self):
        """Temporarily turn off events processing in this instance

        (does not propagate to children)
        """
        self._ignoring_events = True
        try:
            yield
        finally:
            self._ignoring_events = False

    def _emit(self, sig, value=None):
        """An event happened, call its callbacks

        This method can be used in tests to simulate message passing without
        directly changing visual elements.

        Calling of callbacks will halt whenever one returns False.
        """
        logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
        for callback in self._sigs[sig]["callbacks"]:
            if isinstance(callback, str):
                self._emit(callback)
            else:
                try:
                    # running callbacks should not break the interface
                    ret = callback(value)
                    if ret is False:
                        break
                except Exception as e:
                    logger.exception(
                        "Exception (%s) while executing callback for signal: %s"
                        "" % (e, sig)
                    )

    def show(self, threads=False):
        """Open a new browser tab and display this instance's interface"""
        self.panel.show(threads=threads, verbose=False)
        return self


class SingleSelect(SigSlot):
    """A multiselect which only allows you to select one item for an event"""

    signals = ["_selected", "selected"]  # the first is internal
    slots = ["set_options", "set_selection", "add", "clear", "select"]

    def __init__(self, **kwargs):
        self.kwargs = kwargs
        super().__init__()

    def _setup(self):
        self.panel = pn.widgets.MultiSelect(**self.kwargs)
        self._register(self.panel, "_selected", "value")
        self._register(None, "selected")
        self.connect("_selected", self.select_one)

    def _signal(self, *args, **kwargs):
        super()._signal(*args, **kwargs)

    def select_one(self, *_):
        with self.ignore_events():
            val = [self.panel.value[-1]] if self.panel.value else []
            self.panel.value = val
        self._emit("selected", self.panel.value)

    def set_options(self, options):
        self.panel.options = options

    def clear(self):
        self.panel.options = []

    @property
    def value(self):
        return self.panel.value

    def set_selection(self, selection):
        self.panel.value = [selection]


class FileSelector(SigSlot):
    """Panel-based graphical file selector widget

    Instances of this widget are interactive and can be displayed in jupyter by having
    them as the output of a cell, or in a separate browser tab using ``.show()``.
    """

    signals = [
        "protocol_changed",
        "selection_changed",
        "directory_entered",
        "home_clicked",
        "up_clicked",
        "go_clicked",
        "filters_changed",
    ]
    slots = ["set_filters", "go_home"]

    def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
        """

        Parameters
        ----------
        url : str (optional)
            Initial value of the URL to populate the dialog; should include protocol
        filters : list(str) (optional)
            File endings to include in the listings. If not included, all files are
            allowed. Does not affect directories.
            If given, the endings will appear as checkboxes in the interface
        ignore : list(str) (optional)
            Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
            hidden files on posix
        kwargs : dict (optional)
            To pass to file system instance
        """
        if url:
            self.init_protocol, url = split_protocol(url)
        else:
            self.init_protocol, url = "file", os.getcwd()
        self.init_url = url
        self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
        self.filters = filters
        self.ignore = [re.compile(i) for i in ignore or []]
        self._fs = None
        super().__init__()

    def _setup(self):
        self.url = pn.widgets.TextInput(
            name="url",
            value=self.init_url,
            align="end",
            sizing_mode="stretch_width",
            width_policy="max",
        )
        self.protocol = pn.widgets.Select(
            options=sorted(known_implementations),
            value=self.init_protocol,
            name="protocol",
            align="center",
        )
        self.kwargs = pn.widgets.TextInput(
            name="kwargs", value=self.init_kwargs, align="center"
        )
        self.go = pn.widgets.Button(name="⇨", align="end", width=45)
        self.main = SingleSelect(size=10)
        self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
        self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")

        self._register(self.protocol, "protocol_changed", auto=True)
        self._register(self.go, "go_clicked", "clicks", auto=True)
        self._register(self.up, "up_clicked", "clicks", auto=True)
        self._register(self.home, "home_clicked", "clicks", auto=True)
        self._register(None, "selection_changed")
        self.main.connect("selected", self.selection_changed)
        self._register(None, "directory_entered")
        self.prev_protocol = self.protocol.value
        self.prev_kwargs = self.storage_options

        self.filter_sel = pn.widgets.CheckBoxGroup(
            value=[], options=[], inline=False, align="end", width_policy="min"
        )
        self._register(self.filter_sel, "filters_changed", auto=True)

        self.panel = pn.Column(
            pn.Row(self.protocol, self.kwargs),
            pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
            self.main.panel,
        )
        self.set_filters(self.filters)
        self.go_clicked()

    def set_filters(self, filters=None):
        self.filters = filters
        if filters:
            self.filter_sel.options = filters
            self.filter_sel.value = filters
        else:
            self.filter_sel.options = []
            self.filter_sel.value = []

    @property
    def storage_options(self):
        """Value of the kwargs box as a dictionary"""
        return ast.literal_eval(self.kwargs.value) or {}

    @property
    def fs(self):
        """Current filesystem instance"""
        if self._fs is None:
            cls = get_filesystem_class(self.protocol.value)
            self._fs = cls(**self.storage_options)
        return self._fs

    @property
    def urlpath(self):
        """URL of currently selected item"""
        return (
            (f"{self.protocol.value}://{self.main.value[0]}")
            if self.main.value
            else None
        )

    def open_file(self, mode="rb", compression=None, encoding=None):
        """Create OpenFile instance for the currently selected item

        For example, in a notebook you might do something like

        .. code-block::

            [ ]: sel = FileSelector(); sel

            # user selects their file

            [ ]: with sel.open_file('rb') as f:
            ...      out = f.read()

        Parameters
        ----------
        mode: str (optional)
            Open mode for the file.
        compression: str (optional)
            Whether to interact with the file as compressed. Set to 'infer' to guess
            compression from the file ending
        encoding: str (optional)
            If using text mode, use this encoding; defaults to UTF8.
        """
        if self.urlpath is None:
            raise ValueError("No file selected")
        return OpenFile(self.fs, self.urlpath, mode, compression, encoding)

    def filters_changed(self, values):
        self.filters = values
        self.go_clicked()

    def selection_changed(self, *_):
        if self.urlpath is None:
            return
        if self.fs.isdir(self.urlpath):
            self.url.value = self.fs._strip_protocol(self.urlpath)
            self.go_clicked()

    def go_clicked(self, *_):
        if (
            self.prev_protocol != self.protocol.value
            or self.prev_kwargs != self.storage_options
        ):
            self._fs = None  # causes fs to be recreated
            self.prev_protocol = self.protocol.value
            self.prev_kwargs = self.storage_options
        listing = sorted(
            self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
        )
        listing = [
            l
            for l in listing
            if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
        ]
        folders = {
            "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "directory"
        }
        files = {
            "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
            for o in listing
            if o["type"] == "file"
        }
        if self.filters:
            files = {
                k: v
                for k, v in files.items()
                if any(v.endswith(ext) for ext in self.filters)
            }
        self.main.set_options(dict(**folders, **files))

    def protocol_changed(self, *_):
        self._fs = None
        self.main.options = []
        self.url.value = ""

    def home_clicked(self, *_):
        self.protocol.value = self.init_protocol
        self.kwargs.value = self.init_kwargs
        self.url.value = self.init_url
        self.go_clicked()

    def up_clicked(self, *_):
        self.url.value = self.fs._parent(self.url.value)
        self.go_clicked()
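`FileSelector` above is designed for interactive use. A minimal sketch of notebook usage, following the `open_file` docstring (assuming `panel` is installed and the user has picked a file in the rendered widget):

from fsspec.gui import FileSelector

sel = FileSelector()   # starts at the local filesystem, in the current directory
sel                    # displaying the widget as cell output renders the browser

# once a file has been selected in the widget:
with sel.open_file("rb") as f:
    data = f.read()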
lib/python3.11/site-packages/fsspec/implementations/__init__.py
ADDED
File without changes
lib/python3.11/site-packages/fsspec/implementations/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (237 Bytes)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/arrow.cpython-311.pyc
ADDED
Binary file (15.2 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc
ADDED
Binary file (4.84 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc
ADDED
Binary file (12.8 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc
ADDED
Binary file (45.6 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc
ADDED
Binary file (7.8 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/data.cpython-311.pyc
ADDED
Binary file (2.81 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/dbfs.cpython-311.pyc
ADDED
Binary file (19.6 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/dirfs.cpython-311.pyc
ADDED
Binary file (25.3 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/ftp.cpython-311.pyc
ADDED
Binary file (19.4 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/git.cpython-311.pyc
ADDED
Binary file (6.19 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/github.cpython-311.pyc
ADDED
Binary file (11.4 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/http.cpython-311.pyc
ADDED
Binary file (45.2 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/jupyter.cpython-311.pyc
ADDED
Binary file (7.33 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/libarchive.cpython-311.pyc
ADDED
Binary file (10.3 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc
ADDED
Binary file (25.4 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/memory.cpython-311.pyc
ADDED
Binary file (15.3 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc
ADDED
Binary file (65.6 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/sftp.cpython-311.pyc
ADDED
Binary file (10.9 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/smb.cpython-311.pyc
ADDED
Binary file (16.4 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/tar.cpython-311.pyc
ADDED
Binary file (5.41 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc
ADDED
Binary file (24.2 kB)
lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc
ADDED
Binary file (6.67 kB)
lib/python3.11/site-packages/fsspec/implementations/arrow.py
ADDED
@@ -0,0 +1,297 @@
1 |
+
import errno
|
2 |
+
import io
|
3 |
+
import os
|
4 |
+
import secrets
|
5 |
+
import shutil
|
6 |
+
from contextlib import suppress
|
7 |
+
from functools import cached_property, wraps
|
8 |
+
|
9 |
+
from fsspec.spec import AbstractFileSystem
|
10 |
+
from fsspec.utils import (
|
11 |
+
get_package_version_without_import,
|
12 |
+
infer_storage_options,
|
13 |
+
mirror_from,
|
14 |
+
tokenize,
|
15 |
+
)
|
16 |
+
|
17 |
+
|
18 |
+
def wrap_exceptions(func):
|
19 |
+
@wraps(func)
|
20 |
+
def wrapper(*args, **kwargs):
|
21 |
+
try:
|
22 |
+
return func(*args, **kwargs)
|
23 |
+
except OSError as exception:
|
24 |
+
if not exception.args:
|
25 |
+
raise
|
26 |
+
|
27 |
+
message, *args = exception.args
|
28 |
+
if isinstance(message, str) and "does not exist" in message:
|
29 |
+
raise FileNotFoundError(errno.ENOENT, message) from exception
|
30 |
+
else:
|
31 |
+
raise
|
32 |
+
|
33 |
+
return wrapper
|
34 |
+
|
35 |
+
|
36 |
+
PYARROW_VERSION = None
|
37 |
+
|
38 |
+
|
39 |
+
class ArrowFSWrapper(AbstractFileSystem):
|
40 |
+
"""FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
|
41 |
+
|
42 |
+
Parameters
|
43 |
+
----------
|
44 |
+
fs : pyarrow.fs.FileSystem
|
45 |
+
|
46 |
+
"""
|
47 |
+
|
48 |
+
root_marker = "/"
|
49 |
+
|
50 |
+
def __init__(self, fs, **kwargs):
|
51 |
+
global PYARROW_VERSION
|
52 |
+
PYARROW_VERSION = get_package_version_without_import("pyarrow")
|
53 |
+
self.fs = fs
|
54 |
+
super().__init__(**kwargs)
|
55 |
+
|
56 |
+
@property
|
57 |
+
def protocol(self):
|
58 |
+
return self.fs.type_name
|
59 |
+
|
60 |
+
@cached_property
|
61 |
+
def fsid(self):
|
62 |
+
return "hdfs_" + tokenize(self.fs.host, self.fs.port)
|
63 |
+
|
64 |
+
@classmethod
|
65 |
+
def _strip_protocol(cls, path):
|
66 |
+
ops = infer_storage_options(path)
|
67 |
+
path = ops["path"]
|
68 |
+
if path.startswith("//"):
|
69 |
+
# special case for "hdfs://path" (without the triple slash)
|
70 |
+
path = path[1:]
|
71 |
+
return path
|
72 |
+
|
73 |
+
def ls(self, path, detail=False, **kwargs):
|
74 |
+
path = self._strip_protocol(path)
|
75 |
+
from pyarrow.fs import FileSelector
|
76 |
+
|
77 |
+
entries = [
|
78 |
+
self._make_entry(entry)
|
79 |
+
for entry in self.fs.get_file_info(FileSelector(path))
|
80 |
+
]
|
81 |
+
if detail:
|
82 |
+
return entries
|
83 |
+
else:
|
84 |
+
return [entry["name"] for entry in entries]
|
85 |
+
|
86 |
+
def info(self, path, **kwargs):
|
87 |
+
path = self._strip_protocol(path)
|
88 |
+
[info] = self.fs.get_file_info([path])
|
89 |
+
return self._make_entry(info)
|
90 |
+
|
91 |
+
def exists(self, path):
|
92 |
+
path = self._strip_protocol(path)
|
93 |
+
try:
|
94 |
+
self.info(path)
|
95 |
+
except FileNotFoundError:
|
96 |
+
return False
|
97 |
+
else:
|
98 |
+
return True
|
99 |
+
|
100 |
+
def _make_entry(self, info):
|
101 |
+
from pyarrow.fs import FileType
|
102 |
+
|
103 |
+
if info.type is FileType.Directory:
|
104 |
+
kind = "directory"
|
105 |
+
elif info.type is FileType.File:
|
106 |
+
kind = "file"
|
107 |
+
elif info.type is FileType.NotFound:
|
108 |
+
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
|
109 |
+
else:
|
110 |
+
kind = "other"
|
111 |
+
|
112 |
+
return {
|
113 |
+
"name": info.path,
|
114 |
+
"size": info.size,
|
115 |
+
"type": kind,
|
116 |
+
"mtime": info.mtime,
|
117 |
+
}
|
118 |
+
|
119 |
+
@wrap_exceptions
|
120 |
+
def cp_file(self, path1, path2, **kwargs):
|
121 |
+
path1 = self._strip_protocol(path1).rstrip("/")
|
122 |
+
path2 = self._strip_protocol(path2).rstrip("/")
|
123 |
+
|
124 |
+
with self._open(path1, "rb") as lstream:
|
125 |
+
tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
|
126 |
+
try:
|
127 |
+
with self.open(tmp_fname, "wb") as rstream:
|
128 |
+
shutil.copyfileobj(lstream, rstream)
|
129 |
+
self.fs.move(tmp_fname, path2)
|
130 |
+
except BaseException: # noqa
|
131 |
+
with suppress(FileNotFoundError):
|
132 |
+
self.fs.delete_file(tmp_fname)
|
133 |
+
raise
|
134 |
+
|
135 |
+
@wrap_exceptions
|
136 |
+
def mv(self, path1, path2, **kwargs):
|
137 |
+
path1 = self._strip_protocol(path1).rstrip("/")
|
138 |
+
path2 = self._strip_protocol(path2).rstrip("/")
|
139 |
+
self.fs.move(path1, path2)
|
140 |
+
|
141 |
+
mv_file = mv
|
142 |
+
|
143 |
+
@wrap_exceptions
|
144 |
+
def rm_file(self, path):
|
145 |
+
path = self._strip_protocol(path)
|
146 |
+
self.fs.delete_file(path)
|
147 |
+
|
148 |
+
@wrap_exceptions
|
149 |
+
def rm(self, path, recursive=False, maxdepth=None):
|
150 |
+
path = self._strip_protocol(path).rstrip("/")
|
151 |
+
if self.isdir(path):
|
152 |
+
if recursive:
|
153 |
+
self.fs.delete_dir(path)
|
154 |
+
else:
|
155 |
+
raise ValueError("Can't delete directories without recursive=False")
|
156 |
+
else:
|
157 |
+
self.fs.delete_file(path)
|
158 |
+
|
159 |
+
@wrap_exceptions
|
160 |
+
def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
|
161 |
+
if mode == "rb":
|
162 |
+
if seekable:
|
163 |
+
method = self.fs.open_input_file
|
164 |
+
else:
|
165 |
+
method = self.fs.open_input_stream
|
166 |
+
elif mode == "wb":
|
167 |
+
method = self.fs.open_output_stream
|
168 |
+
elif mode == "ab":
|
169 |
+
method = self.fs.open_append_stream
|
170 |
+
else:
|
171 |
+
raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
|
172 |
+
|
173 |
+
_kwargs = {}
|
174 |
+
if mode != "rb" or not seekable:
|
175 |
+
if int(PYARROW_VERSION.split(".")[0]) >= 4:
|
176 |
+
# disable compression auto-detection
|
177 |
+
_kwargs["compression"] = None
|
178 |
+
stream = method(path, **_kwargs)
|
179 |
+
|
180 |
+
return ArrowFile(self, stream, path, mode, block_size, **kwargs)
|
181 |
+
|
182 |
+
@wrap_exceptions
|
183 |
+
def mkdir(self, path, create_parents=True, **kwargs):
|
184 |
+
path = self._strip_protocol(path)
|
185 |
+
if create_parents:
|
186 |
+
self.makedirs(path, exist_ok=True)
|
187 |
+
else:
|
188 |
+
self.fs.create_dir(path, recursive=False)
|
189 |
+
|
190 |
+
@wrap_exceptions
|
191 |
+
def makedirs(self, path, exist_ok=False):
|
192 |
+
path = self._strip_protocol(path)
|
193 |
+
self.fs.create_dir(path, recursive=True)
|
194 |
+
|
195 |
+
@wrap_exceptions
|
196 |
+
def rmdir(self, path):
|
197 |
+
path = self._strip_protocol(path)
|
198 |
+
self.fs.delete_dir(path)
|
199 |
+
|
200 |
+
@wrap_exceptions
|
201 |
+
def modified(self, path):
|
202 |
+
path = self._strip_protocol(path)
|
203 |
+
return self.fs.get_file_info(path).mtime
|
204 |
+
|
205 |
+
def cat_file(self, path, start=None, end=None, **kwargs):
|
206 |
+
kwargs["seekable"] = start not in [None, 0]
|
207 |
+
return super().cat_file(path, start=None, end=None, **kwargs)
|
208 |
+
|
209 |
+
def get_file(self, rpath, lpath, **kwargs):
|
210 |
+
kwargs["seekable"] = False
|
211 |
+
super().get_file(rpath, lpath, **kwargs)
|
212 |
+
|
213 |
+
|
214 |
+
@mirror_from(
|
215 |
+
"stream",
|
216 |
+
[
|
217 |
+
"read",
|
218 |
+
"seek",
|
219 |
+
"tell",
|
220 |
+
"write",
|
221 |
+
"readable",
|
222 |
+
"writable",
|
223 |
+
"close",
|
224 |
+
"size",
|
225 |
+
"seekable",
|
226 |
+
],
|
227 |
+
)
|
228 |
+
class ArrowFile(io.IOBase):
|
229 |
+
def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
|
230 |
+
self.path = path
|
231 |
+
self.mode = mode
|
232 |
+
|
233 |
+
self.fs = fs
|
234 |
+
self.stream = stream
|
235 |
+
|
236 |
+
self.blocksize = self.block_size = block_size
|
237 |
+
self.kwargs = kwargs
|
238 |
+
|
239 |
+
def __enter__(self):
|
240 |
+
return self
|
241 |
+
|
242 |
+
def __exit__(self, *args):
|
243 |
+
return self.close()
|
244 |
+
|
245 |
+
|
246 |
+
class HadoopFileSystem(ArrowFSWrapper):
|
247 |
+
"""A wrapper on top of the pyarrow.fs.HadoopFileSystem
|
248 |
+
to connect its interface with fsspec"""
|
249 |
+
|
250 |
+
protocol = "hdfs"
|
251 |
+
|
252 |
+
def __init__(
|
253 |
+
self,
|
254 |
+
host="default",
|
255 |
+
port=0,
|
256 |
+
user=None,
|
257 |
+
kerb_ticket=None,
|
258 |
+
extra_conf=None,
|
259 |
+
**kwargs,
|
260 |
+
):
|
261 |
+
"""
|
262 |
+
|
263 |
+
Parameters
|
264 |
+
----------
|
265 |
+
host: str
|
266 |
+
Hostname, IP or "default" to try to read from Hadoop config
|
267 |
+
port: int
|
268 |
+
Port to connect on, or default from Hadoop config if 0
|
269 |
+
user: str or None
|
270 |
+
If given, connect as this username
|
271 |
+
kerb_ticket: str or None
|
272 |
+
If given, use this ticket for authentication
|
273 |
+
extra_conf: None or dict
|
274 |
+
Passed on to HadoopFileSystem
|
275 |
+
"""
|
276 |
+
from pyarrow.fs import HadoopFileSystem
|
277 |
+
|
278 |
+
fs = HadoopFileSystem(
|
279 |
+
host=host,
|
280 |
+
port=port,
|
281 |
+
user=user,
|
282 |
+
kerb_ticket=kerb_ticket,
|
283 |
+
extra_conf=extra_conf,
|
284 |
+
)
|
285 |
+
super().__init__(fs=fs, **kwargs)
|
286 |
+
|
287 |
+
@staticmethod
|
288 |
+
def _get_kwargs_from_urls(path):
|
289 |
+
ops = infer_storage_options(path)
|
290 |
+
out = {}
|
291 |
+
if ops.get("host", None):
|
292 |
+
out["host"] = ops["host"]
|
293 |
+
if ops.get("username", None):
|
294 |
+
out["user"] = ops["username"]
|
295 |
+
if ops.get("port", None):
|
296 |
+
out["port"] = ops["port"]
|
297 |
+
return out
|
lib/python3.11/site-packages/fsspec/implementations/cache_mapper.py
ADDED
@@ -0,0 +1,80 @@
from __future__ import annotations

import abc
import hashlib
from typing import TYPE_CHECKING

from fsspec.implementations.local import make_path_posix

if TYPE_CHECKING:
    from typing import Any


class AbstractCacheMapper(abc.ABC):
    """Abstract super-class for mappers from remote URLs to local cached
    basenames.
    """

    @abc.abstractmethod
    def __call__(self, path: str) -> str:
        ...

    def __eq__(self, other: Any) -> bool:
        # Identity only depends on class. When derived classes have attributes
        # they will need to be included.
        return isinstance(other, type(self))

    def __hash__(self) -> int:
        # Identity only depends on class. When derived classes have attributes
        # they will need to be included.
        return hash(type(self))


class BasenameCacheMapper(AbstractCacheMapper):
    """Cache mapper that uses the basename of the remote URL and a fixed number
    of directory levels above this.

    The default is zero directory levels, meaning different paths with the same
    basename will have the same cached basename.
    """

    def __init__(self, directory_levels: int = 0):
        if directory_levels < 0:
            raise ValueError(
                "BasenameCacheMapper requires zero or positive directory_levels"
            )
        self.directory_levels = directory_levels

        # Separator for directories when encoded as strings.
        self._separator = "_@_"

    def __call__(self, path: str) -> str:
        path = make_path_posix(path)
        prefix, *bits = path.rsplit("/", self.directory_levels + 1)
        if bits:
            return self._separator.join(bits)
        else:
            return prefix  # No separator found, simple filename

    def __eq__(self, other: Any) -> bool:
        return super().__eq__(other) and self.directory_levels == other.directory_levels

    def __hash__(self) -> int:
        return super().__hash__() ^ hash(self.directory_levels)


class HashCacheMapper(AbstractCacheMapper):
    """Cache mapper that uses a hash of the remote URL."""

    def __call__(self, path: str) -> str:
        return hashlib.sha256(path.encode()).hexdigest()


def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
    """Factory method to create cache mapper for backward compatibility with
    ``CachingFileSystem`` constructor using ``same_names`` kwarg.
    """
    if same_names:
        return BasenameCacheMapper()
    else:
        return HashCacheMapper()
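A short sketch of how the two mappers turn remote paths into local cache basenames; the path is purely illustrative and the outputs are shown for a POSIX system:

from fsspec.implementations.cache_mapper import BasenameCacheMapper, create_cache_mapper

hash_mapper = create_cache_mapper(same_names=False)   # HashCacheMapper
name_mapper = create_cache_mapper(same_names=True)    # BasenameCacheMapper()

path = "/data/project/archive.csv"
print(hash_mapper(path))   # 64-character sha256 hex digest of the path
print(name_mapper(path))   # "archive.csv"

# Keep one directory level to disambiguate identical basenames
print(BasenameCacheMapper(directory_levels=1)(path))   # "project_@_archive.csv"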
lib/python3.11/site-packages/fsspec/implementations/cache_metadata.py
ADDED
@@ -0,0 +1,232 @@
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import os
|
4 |
+
import pickle
|
5 |
+
import time
|
6 |
+
from typing import TYPE_CHECKING
|
7 |
+
|
8 |
+
from fsspec.utils import atomic_write
|
9 |
+
|
10 |
+
try:
|
11 |
+
import ujson as json
|
12 |
+
except ImportError:
|
13 |
+
if not TYPE_CHECKING:
|
14 |
+
import json
|
15 |
+
|
16 |
+
if TYPE_CHECKING:
|
17 |
+
from typing import Any, Dict, Iterator, Literal
|
18 |
+
|
19 |
+
from typing_extensions import TypeAlias
|
20 |
+
|
21 |
+
from .cached import CachingFileSystem
|
22 |
+
|
23 |
+
Detail: TypeAlias = Dict[str, Any]
|
24 |
+
|
25 |
+
|
26 |
+
class CacheMetadata:
|
27 |
+
"""Cache metadata.
|
28 |
+
|
29 |
+
All reading and writing of cache metadata is performed by this class;
|
30 |
+
accessing the cached files and blocks is not.
|
31 |
+
|
32 |
+
Metadata is stored in a single file per storage directory in JSON format.
|
33 |
+
For backward compatibility, also reads metadata stored in pickle format
|
34 |
+
which is converted to JSON when next saved.
|
35 |
+
"""
|
36 |
+
|
37 |
+
def __init__(self, storage: list[str]):
|
38 |
+
"""
|
39 |
+
|
40 |
+
Parameters
|
41 |
+
----------
|
42 |
+
storage: list[str]
|
43 |
+
Directories containing cached files, must be at least one. Metadata
|
44 |
+
is stored in the last of these directories by convention.
|
45 |
+
"""
|
46 |
+
if not storage:
|
47 |
+
raise ValueError("CacheMetadata expects at least one storage location")
|
48 |
+
|
49 |
+
self._storage = storage
|
50 |
+
self.cached_files: list[Detail] = [{}]
|
51 |
+
|
52 |
+
# Private attribute to force saving of metadata in pickle format rather than
|
53 |
+
# JSON for use in tests, to confirm that both pickle and JSON formats can be read.
|
54 |
+
self._force_save_pickle = False
|
55 |
+
|
56 |
+
def _load(self, fn: str) -> Detail:
|
57 |
+
"""Low-level function to load metadata from specific file"""
|
58 |
+
try:
|
59 |
+
with open(fn, "r") as f:
|
60 |
+
return json.load(f)
|
61 |
+
except ValueError:
|
62 |
+
with open(fn, "rb") as f:
|
63 |
+
return pickle.load(f)
|
64 |
+
|
65 |
+
def _save(self, metadata_to_save: Detail, fn: str) -> None:
|
66 |
+
"""Low-level function to save metadata to specific file"""
|
67 |
+
if self._force_save_pickle:
|
68 |
+
with atomic_write(fn) as f:
|
69 |
+
pickle.dump(metadata_to_save, f)
|
70 |
+
else:
|
71 |
+
with atomic_write(fn, mode="w") as f:
|
72 |
+
json.dump(metadata_to_save, f)
|
73 |
+
|
74 |
+
def _scan_locations(
|
75 |
+
self, writable_only: bool = False
|
76 |
+
) -> Iterator[tuple[str, str, bool]]:
|
77 |
+
"""Yield locations (filenames) where metadata is stored, and whether
|
78 |
+
writable or not.
|
79 |
+
|
80 |
+
Parameters
|
81 |
+
----------
|
82 |
+
writable_only: bool
|
83 |
+
Set to True to only yield writable locations.
|
84 |
+
|
85 |
+
Returns
|
86 |
+
-------
|
87 |
+
Yields (str, str, bool)
|
88 |
+
"""
|
89 |
+
n = len(self._storage)
|
90 |
+
for i, storage in enumerate(self._storage):
|
91 |
+
writable = i == n - 1
|
92 |
+
if writable_only and not writable:
|
93 |
+
continue
|
94 |
+
yield os.path.join(storage, "cache"), storage, writable
|
95 |
+
|
96 |
+
def check_file(
|
97 |
+
self, path: str, cfs: CachingFileSystem | None
|
98 |
+
) -> Literal[False] | tuple[Detail, str]:
|
99 |
+
"""If path is in cache return its details, otherwise return ``False``.
|
100 |
+
|
101 |
+
If the optional CachingFileSystem is specified then it is used to
|
102 |
+
perform extra checks to reject possible matches, such as if they are
|
103 |
+
too old.
|
104 |
+
"""
|
105 |
+
for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
|
106 |
+
if path not in cache:
|
107 |
+
continue
|
108 |
+
detail = cache[path].copy()
|
109 |
+
|
110 |
+
if cfs is not None:
|
111 |
+
if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
|
112 |
+
# Wrong file as determined by hash of file properties
|
113 |
+
continue
|
114 |
+
if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
|
115 |
+
# Cached file has expired
|
116 |
+
continue
|
117 |
+
|
118 |
+
fn = os.path.join(base, detail["fn"])
|
119 |
+
if os.path.exists(fn):
|
120 |
+
return detail, fn
|
121 |
+
return False
|
122 |
+
|
123 |
+
def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
|
124 |
+
"""Remove expired metadata from the cache.
|
125 |
+
|
126 |
+
Returns names of files corresponding to expired metadata and a boolean
|
127 |
+
flag indicating whether the writable cache is empty. Caller is
|
128 |
+
responsible for deleting the expired files.
|
129 |
+
"""
|
130 |
+
expired_files = []
|
131 |
+
for path, detail in self.cached_files[-1].copy().items():
|
132 |
+
if time.time() - detail["time"] > expiry_time:
|
133 |
+
fn = detail.get("fn", "")
|
134 |
+
if not fn:
|
135 |
+
raise RuntimeError(
|
136 |
+
f"Cache metadata does not contain 'fn' for {path}"
|
137 |
+
)
|
138 |
+
fn = os.path.join(self._storage[-1], fn)
|
139 |
+
expired_files.append(fn)
|
140 |
+
self.cached_files[-1].pop(path)
|
141 |
+
|
142 |
+
if self.cached_files[-1]:
|
143 |
+
cache_path = os.path.join(self._storage[-1], "cache")
|
144 |
+
self._save(self.cached_files[-1], cache_path)
|
145 |
+
|
146 |
+
writable_cache_empty = not self.cached_files[-1]
|
147 |
+
return expired_files, writable_cache_empty
|
148 |
+
|
149 |
+
def load(self) -> None:
|
150 |
+
"""Load all metadata from disk and store in ``self.cached_files``"""
|
151 |
+
cached_files = []
|
152 |
+
for fn, _, _ in self._scan_locations():
|
153 |
+
if os.path.exists(fn):
|
154 |
+
# TODO: consolidate blocks here
|
155 |
+
loaded_cached_files = self._load(fn)
|
156 |
+
for c in loaded_cached_files.values():
|
157 |
+
if isinstance(c["blocks"], list):
|
158 |
+
c["blocks"] = set(c["blocks"])
|
159 |
+
cached_files.append(loaded_cached_files)
|
160 |
+
else:
|
161 |
+
cached_files.append({})
|
162 |
+
self.cached_files = cached_files or [{}]
|
163 |
+
|
164 |
+
def on_close_cached_file(self, f: Any, path: str) -> None:
|
165 |
+
"""Perform side-effect actions on closing a cached file.
|
166 |
+
|
167 |
+
The actual closing of the file is the responsibility of the caller.
|
168 |
+
"""
|
169 |
+
# File must be writable, so in self.cached_files[-1]
|
170 |
+
c = self.cached_files[-1][path]
|
171 |
+
if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
|
172 |
+
c["blocks"] = True
|
173 |
+
|
174 |
+
def pop_file(self, path: str) -> str | None:
|
175 |
+
"""Remove metadata of cached file.
|
176 |
+
|
177 |
+
If path is in the cache, return the filename of the cached file,
|
178 |
+
otherwise return ``None``. Caller is responsible for deleting the
|
179 |
+
cached file.
|
180 |
+
"""
|
181 |
+
details = self.check_file(path, None)
|
182 |
+
if not details:
|
183 |
+
return None
|
184 |
+
_, fn = details
|
185 |
+
if fn.startswith(self._storage[-1]):
|
186 |
+
self.cached_files[-1].pop(path)
|
187 |
+
self.save()
|
188 |
+
else:
|
189 |
+
raise PermissionError(
|
190 |
+
"Can only delete cached file in last, writable cache location"
|
191 |
+
)
|
192 |
+
return fn
|
193 |
+
|
194 |
+
def save(self) -> None:
|
195 |
+
"""Save metadata to disk"""
|
196 |
+
for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
|
197 |
+
if not writable:
|
198 |
+
continue
|
199 |
+
|
200 |
+
if os.path.exists(fn):
|
201 |
+
cached_files = self._load(fn)
|
202 |
+
for k, c in cached_files.items():
|
203 |
+
if k in cache:
|
204 |
+
if c["blocks"] is True or cache[k]["blocks"] is True:
|
205 |
+
c["blocks"] = True
|
206 |
+
else:
|
207 |
+
# self.cached_files[*][*]["blocks"] must continue to
|
208 |
+
# point to the same set object so that updates
|
209 |
+
# performed by MMapCache are propagated back to
|
210 |
+
# self.cached_files.
|
211 |
+
blocks = cache[k]["blocks"]
|
212 |
+
blocks.update(c["blocks"])
|
213 |
+
c["blocks"] = blocks
|
214 |
+
c["time"] = max(c["time"], cache[k]["time"])
|
215 |
+
c["uid"] = cache[k]["uid"]
|
216 |
+
|
217 |
+
# Files can be added to cache after it was written once
|
218 |
+
for k, c in cache.items():
|
219 |
+
if k not in cached_files:
|
220 |
+
cached_files[k] = c
|
221 |
+
else:
|
222 |
+
cached_files = cache
|
223 |
+
cache = {k: v.copy() for k, v in cached_files.items()}
|
224 |
+
for c in cache.values():
|
225 |
+
if isinstance(c["blocks"], set):
|
226 |
+
c["blocks"] = list(c["blocks"])
|
227 |
+
self._save(cache, fn)
|
228 |
+
self.cached_files[-1] = cached_files
|
229 |
+
|
230 |
+
def update_file(self, path: str, detail: Detail) -> None:
|
231 |
+
"""Update metadata for specific file in memory, do not save"""
|
232 |
+
self.cached_files[-1][path] = detail
|
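CacheMetadata is an internal helper used by the caching filesystems below, but its JSON round trip can be exercised on its own. A minimal sketch with a scratch directory and a made-up cache entry (the detail keys mirror what CachingFileSystem writes):

import os
import tempfile
import time

from fsspec.implementations.cache_metadata import CacheMetadata

storage = tempfile.mkdtemp()
meta = CacheMetadata([storage])

detail = {
    "original": "memory://data.bin",   # hypothetical remote path
    "fn": "abc123",                    # basename of the local copy
    "blocks": True,                    # True means the whole file is cached
    "time": time.time(),
    "uid": "uid-0",
}
open(os.path.join(storage, "abc123"), "wb").close()   # pretend cached file
meta.update_file("memory://data.bin", detail)
meta.save()                                            # writes <storage>/cache as JSON

meta2 = CacheMetadata([storage])
meta2.load()
print(meta2.check_file("memory://data.bin", None))     # (detail, path to local copy)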
lib/python3.11/site-packages/fsspec/implementations/cached.py
ADDED
@@ -0,0 +1,864 @@
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import inspect
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
import tempfile
|
7 |
+
import time
|
8 |
+
import weakref
|
9 |
+
from shutil import rmtree
|
10 |
+
from typing import TYPE_CHECKING, Any, Callable, ClassVar
|
11 |
+
|
12 |
+
from fsspec import AbstractFileSystem, filesystem
|
13 |
+
from fsspec.callbacks import _DEFAULT_CALLBACK
|
14 |
+
from fsspec.compression import compr
|
15 |
+
from fsspec.core import BaseCache, MMapCache
|
16 |
+
from fsspec.exceptions import BlocksizeMismatchError
|
17 |
+
from fsspec.implementations.cache_mapper import create_cache_mapper
|
18 |
+
from fsspec.implementations.cache_metadata import CacheMetadata
|
19 |
+
from fsspec.spec import AbstractBufferedFile
|
20 |
+
from fsspec.transaction import Transaction
|
21 |
+
from fsspec.utils import infer_compression
|
22 |
+
|
23 |
+
if TYPE_CHECKING:
|
24 |
+
from fsspec.implementations.cache_mapper import AbstractCacheMapper
|
25 |
+
|
26 |
+
logger = logging.getLogger("fsspec.cached")
|
27 |
+
|
28 |
+
|
29 |
+
class WriteCachedTransaction(Transaction):
|
30 |
+
def complete(self, commit=True):
|
31 |
+
rpaths = [f.path for f in self.files]
|
32 |
+
lpaths = [f.fn for f in self.files]
|
33 |
+
if commit:
|
34 |
+
self.fs.put(lpaths, rpaths)
|
35 |
+
# else remove?
|
36 |
+
self.fs._intrans = False
|
37 |
+
|
38 |
+
|
39 |
+
class CachingFileSystem(AbstractFileSystem):
|
40 |
+
"""Locally caching filesystem, layer over any other FS
|
41 |
+
|
42 |
+
This class implements chunk-wise local storage of remote files, for quick
|
43 |
+
access after the initial download. The files are stored in a given
|
44 |
+
directory with hashes of URLs for the filenames. If no directory is given,
|
45 |
+
a temporary one is used, which should be cleaned up by the OS after the
|
46 |
+
process ends. The files themselves are sparse (as implemented in
|
47 |
+
:class:`~fsspec.caching.MMapCache`), so only the data which is accessed
|
48 |
+
takes up space.
|
49 |
+
|
50 |
+
Restrictions:
|
51 |
+
|
52 |
+
- the block-size must be the same for each access of a given file, unless
|
53 |
+
all blocks of the file have already been read
|
54 |
+
- caching can only be applied to file-systems which produce files
|
55 |
+
derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
|
56 |
+
allowed, for testing
|
57 |
+
"""
|
58 |
+
|
59 |
+
protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")
|
60 |
+
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
target_protocol=None,
|
64 |
+
cache_storage="TMP",
|
65 |
+
cache_check=10,
|
66 |
+
check_files=False,
|
67 |
+
expiry_time=604800,
|
68 |
+
target_options=None,
|
69 |
+
fs=None,
|
70 |
+
same_names: bool | None = None,
|
71 |
+
compression=None,
|
72 |
+
cache_mapper: AbstractCacheMapper | None = None,
|
73 |
+
**kwargs,
|
74 |
+
):
|
75 |
+
"""
|
76 |
+
|
77 |
+
Parameters
|
78 |
+
----------
|
79 |
+
target_protocol: str (optional)
|
80 |
+
Target filesystem protocol. Provide either this or ``fs``.
|
81 |
+
cache_storage: str or list(str)
|
82 |
+
Location to store files. If "TMP", this is a temporary directory,
|
83 |
+
and will be cleaned up by the OS when this process ends (or later).
|
84 |
+
If a list, each location will be tried in the order given, but
|
85 |
+
only the last will be considered writable.
|
86 |
+
cache_check: int
|
87 |
+
Number of seconds between reload of cache metadata
|
88 |
+
check_files: bool
|
89 |
+
Whether to explicitly see if the UID of the remote file matches
|
90 |
+
the stored one before using. Warning: some file systems such as
|
91 |
+
HTTP cannot reliably give a unique hash of the contents of some
|
92 |
+
path, so be sure to set this option to False.
|
93 |
+
expiry_time: int
|
94 |
+
The time in seconds after which a local copy is considered useless.
|
95 |
+
Set to falsy to prevent expiry. The default is equivalent to one
|
96 |
+
week.
|
97 |
+
target_options: dict or None
|
98 |
+
Passed to the instantiation of the FS, if fs is None.
|
99 |
+
fs: filesystem instance
|
100 |
+
The target filesystem to run against. Provide this or ``protocol``.
|
101 |
+
same_names: bool (optional)
|
102 |
+
By default, target URLs are hashed using a ``HashCacheMapper`` so
|
103 |
+
that files from different backends with the same basename do not
|
104 |
+
conflict. If this argument is ``true``, a ``BasenameCacheMapper``
|
105 |
+
is used instead. Other cache mapper options are available by using
|
106 |
+
the ``cache_mapper`` keyword argument. Only one of this and
|
107 |
+
``cache_mapper`` should be specified.
|
108 |
+
compression: str (optional)
|
109 |
+
To decompress on download. Can be 'infer' (guess from the URL name),
|
110 |
+
one of the entries in ``fsspec.compression.compr``, or None for no
|
111 |
+
decompression.
|
112 |
+
cache_mapper: AbstractCacheMapper (optional)
|
113 |
+
The object use to map from original filenames to cached filenames.
|
114 |
+
Only one of this and ``same_names`` should be specified.
|
115 |
+
"""
|
116 |
+
super().__init__(**kwargs)
|
117 |
+
if fs is None and target_protocol is None:
|
118 |
+
raise ValueError(
|
119 |
+
"Please provide filesystem instance(fs) or target_protocol"
|
120 |
+
)
|
121 |
+
if not (fs is None) ^ (target_protocol is None):
|
122 |
+
raise ValueError(
|
123 |
+
"Both filesystems (fs) and target_protocol may not be both given."
|
124 |
+
)
|
125 |
+
if cache_storage == "TMP":
|
126 |
+
tempdir = tempfile.mkdtemp()
|
127 |
+
storage = [tempdir]
|
128 |
+
weakref.finalize(self, self._remove_tempdir, tempdir)
|
129 |
+
else:
|
130 |
+
if isinstance(cache_storage, str):
|
131 |
+
storage = [cache_storage]
|
132 |
+
else:
|
133 |
+
storage = cache_storage
|
134 |
+
os.makedirs(storage[-1], exist_ok=True)
|
135 |
+
self.storage = storage
|
136 |
+
self.kwargs = target_options or {}
|
137 |
+
self.cache_check = cache_check
|
138 |
+
self.check_files = check_files
|
139 |
+
self.expiry = expiry_time
|
140 |
+
self.compression = compression
|
141 |
+
|
142 |
+
# Size of cache in bytes. If None then the size is unknown and will be
|
143 |
+
# recalculated the next time cache_size() is called. On writes to the
|
144 |
+
# cache this is reset to None.
|
145 |
+
self._cache_size = None
|
146 |
+
|
147 |
+
if same_names is not None and cache_mapper is not None:
|
148 |
+
raise ValueError(
|
149 |
+
"Cannot specify both same_names and cache_mapper in "
|
150 |
+
"CachingFileSystem.__init__"
|
151 |
+
)
|
152 |
+
if cache_mapper is not None:
|
153 |
+
self._mapper = cache_mapper
|
154 |
+
else:
|
155 |
+
self._mapper = create_cache_mapper(
|
156 |
+
same_names if same_names is not None else False
|
157 |
+
)
|
158 |
+
|
159 |
+
self.target_protocol = (
|
160 |
+
target_protocol
|
161 |
+
if isinstance(target_protocol, str)
|
162 |
+
else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
|
163 |
+
)
|
164 |
+
self._metadata = CacheMetadata(self.storage)
|
165 |
+
self.load_cache()
|
166 |
+
self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)
|
167 |
+
|
168 |
+
def _strip_protocol(path):
|
169 |
+
# acts as a method, since each instance has a different target
|
170 |
+
return self.fs._strip_protocol(type(self)._strip_protocol(path))
|
171 |
+
|
172 |
+
self._strip_protocol: Callable = _strip_protocol
|
173 |
+
|
174 |
+
@staticmethod
|
175 |
+
def _remove_tempdir(tempdir):
|
176 |
+
try:
|
177 |
+
rmtree(tempdir)
|
178 |
+
except Exception:
|
179 |
+
pass
|
180 |
+
|
181 |
+
def _mkcache(self):
|
182 |
+
os.makedirs(self.storage[-1], exist_ok=True)
|
183 |
+
|
184 |
+
def cache_size(self):
|
185 |
+
"""Return size of cache in bytes.
|
186 |
+
|
187 |
+
If more than one cache directory is in use, only the size of the last
|
188 |
+
one (the writable cache directory) is returned.
|
189 |
+
"""
|
190 |
+
if self._cache_size is None:
|
191 |
+
cache_dir = self.storage[-1]
|
192 |
+
self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
|
193 |
+
return self._cache_size
|
194 |
+
|
195 |
+
def load_cache(self):
|
196 |
+
"""Read set of stored blocks from file"""
|
197 |
+
self._metadata.load()
|
198 |
+
self._mkcache()
|
199 |
+
self.last_cache = time.time()
|
200 |
+
|
201 |
+
def save_cache(self):
|
202 |
+
"""Save set of stored blocks from file"""
|
203 |
+
self._mkcache()
|
204 |
+
self._metadata.save()
|
205 |
+
self.last_cache = time.time()
|
206 |
+
self._cache_size = None
|
207 |
+
|
208 |
+
def _check_cache(self):
|
209 |
+
"""Reload caches if time elapsed or any disappeared"""
|
210 |
+
self._mkcache()
|
211 |
+
if not self.cache_check:
|
212 |
+
# explicitly told not to bother checking
|
213 |
+
return
|
214 |
+
timecond = time.time() - self.last_cache > self.cache_check
|
215 |
+
existcond = all(os.path.exists(storage) for storage in self.storage)
|
216 |
+
if timecond or not existcond:
|
217 |
+
self.load_cache()
|
218 |
+
|
219 |
+
def _check_file(self, path):
|
220 |
+
"""Is path in cache and still valid"""
|
221 |
+
path = self._strip_protocol(path)
|
222 |
+
self._check_cache()
|
223 |
+
return self._metadata.check_file(path, self)
|
224 |
+
|
225 |
+
def clear_cache(self):
|
226 |
+
"""Remove all files and metadata from the cache
|
227 |
+
|
228 |
+
In the case of multiple cache locations, this clears only the last one,
|
229 |
+
which is assumed to be the read/write one.
|
230 |
+
"""
|
231 |
+
rmtree(self.storage[-1])
|
232 |
+
self.load_cache()
|
233 |
+
self._cache_size = None
|
234 |
+
|
235 |
+
def clear_expired_cache(self, expiry_time=None):
|
236 |
+
"""Remove all expired files and metadata from the cache
|
237 |
+
|
238 |
+
In the case of multiple cache locations, this clears only the last one,
|
239 |
+
which is assumed to be the read/write one.
|
240 |
+
|
241 |
+
Parameters
|
242 |
+
----------
|
243 |
+
expiry_time: int
|
244 |
+
The time in seconds after which a local copy is considered useless.
|
245 |
+
If not defined the default is equivalent to the attribute from the
|
246 |
+
file caching instantiation.
|
247 |
+
"""
|
248 |
+
|
249 |
+
if not expiry_time:
|
250 |
+
expiry_time = self.expiry
|
251 |
+
|
252 |
+
self._check_cache()
|
253 |
+
|
254 |
+
expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
|
255 |
+
for fn in expired_files:
|
256 |
+
if os.path.exists(fn):
|
257 |
+
os.remove(fn)
|
258 |
+
|
259 |
+
if writable_cache_empty:
|
260 |
+
rmtree(self.storage[-1])
|
261 |
+
self.load_cache()
|
262 |
+
|
263 |
+
self._cache_size = None
|
264 |
+
|
265 |
+
def pop_from_cache(self, path):
|
266 |
+
"""Remove cached version of given file
|
267 |
+
|
268 |
+
Deletes local copy of the given (remote) path. If it is found in a cache
|
269 |
+
location which is not the last, it is assumed to be read-only, and
|
270 |
+
raises PermissionError
|
271 |
+
"""
|
272 |
+
path = self._strip_protocol(path)
|
273 |
+
fn = self._metadata.pop_file(path)
|
274 |
+
if fn is not None:
|
275 |
+
os.remove(fn)
|
276 |
+
self._cache_size = None
|
277 |
+
|
278 |
+
def _open(
|
279 |
+
self,
|
280 |
+
path,
|
281 |
+
mode="rb",
|
282 |
+
block_size=None,
|
283 |
+
autocommit=True,
|
284 |
+
cache_options=None,
|
285 |
+
**kwargs,
|
286 |
+
):
|
287 |
+
"""Wrap the target _open
|
288 |
+
|
289 |
+
If the whole file exists in the cache, just open it locally and
|
290 |
+
return that.
|
291 |
+
|
292 |
+
Otherwise, open the file on the target FS, and make it have a mmap
|
293 |
+
cache pointing to the location which we determine, in our cache.
|
294 |
+
The ``blocks`` instance is shared, so as the mmap cache instance
|
295 |
+
updates, so does the entry in our ``cached_files`` attribute.
|
296 |
+
We monkey-patch this file, so that when it closes, we call
|
297 |
+
``close_and_update`` to save the state of the blocks.
|
298 |
+
"""
|
299 |
+
path = self._strip_protocol(path)
|
300 |
+
|
301 |
+
path = self.fs._strip_protocol(path)
|
302 |
+
if "r" not in mode:
|
303 |
+
return self.fs._open(
|
304 |
+
path,
|
305 |
+
mode=mode,
|
306 |
+
block_size=block_size,
|
307 |
+
autocommit=autocommit,
|
308 |
+
cache_options=cache_options,
|
309 |
+
**kwargs,
|
310 |
+
)
|
311 |
+
detail = self._check_file(path)
|
312 |
+
if detail:
|
313 |
+
# file is in cache
|
314 |
+
detail, fn = detail
|
315 |
+
hash, blocks = detail["fn"], detail["blocks"]
|
316 |
+
if blocks is True:
|
317 |
+
# stored file is complete
|
318 |
+
logger.debug("Opening local copy of %s", path)
|
319 |
+
return open(fn, mode)
|
320 |
+
# TODO: action where partial file exists in read-only cache
|
321 |
+
logger.debug("Opening partially cached copy of %s", path)
|
322 |
+
else:
|
323 |
+
hash = self._mapper(path)
|
324 |
+
fn = os.path.join(self.storage[-1], hash)
|
325 |
+
blocks = set()
|
326 |
+
detail = {
|
327 |
+
"original": path,
|
328 |
+
"fn": hash,
|
329 |
+
"blocks": blocks,
|
330 |
+
"time": time.time(),
|
331 |
+
"uid": self.fs.ukey(path),
|
332 |
+
}
|
333 |
+
self._metadata.update_file(path, detail)
|
334 |
+
logger.debug("Creating local sparse file for %s", path)
|
335 |
+
|
336 |
+
# call target filesystems open
|
337 |
+
self._mkcache()
|
338 |
+
f = self.fs._open(
|
339 |
+
path,
|
340 |
+
mode=mode,
|
341 |
+
block_size=block_size,
|
342 |
+
autocommit=autocommit,
|
343 |
+
cache_options=cache_options,
|
344 |
+
cache_type="none",
|
345 |
+
**kwargs,
|
346 |
+
)
|
347 |
+
if self.compression:
|
348 |
+
comp = (
|
349 |
+
infer_compression(path)
|
350 |
+
if self.compression == "infer"
|
351 |
+
else self.compression
|
352 |
+
)
|
353 |
+
f = compr[comp](f, mode="rb")
|
354 |
+
if "blocksize" in detail:
|
355 |
+
if detail["blocksize"] != f.blocksize:
|
356 |
+
raise BlocksizeMismatchError(
|
357 |
+
f"Cached file must be reopened with same block"
|
358 |
+
f" size as original (old: {detail['blocksize']},"
|
359 |
+
f" new {f.blocksize})"
|
360 |
+
)
|
361 |
+
else:
|
362 |
+
detail["blocksize"] = f.blocksize
|
363 |
+
f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
|
364 |
+
close = f.close
|
365 |
+
f.close = lambda: self.close_and_update(f, close)
|
366 |
+
self.save_cache()
|
367 |
+
return f
|
368 |
+
|
369 |
+
def _parent(self, path):
|
370 |
+
return self.fs._parent(path)
|
371 |
+
|
372 |
+
def hash_name(self, path: str, *args: Any) -> str:
|
373 |
+
# Kept for backward compatibility with downstream libraries.
|
374 |
+
# Ignores extra arguments, previously same_name boolean.
|
375 |
+
return self._mapper(path)
|
376 |
+
|
377 |
+
def close_and_update(self, f, close):
|
378 |
+
"""Called when a file is closing, so store the set of blocks"""
|
379 |
+
if f.closed:
|
380 |
+
return
|
381 |
+
path = self._strip_protocol(f.path)
|
382 |
+
self._metadata.on_close_cached_file(f, path)
|
383 |
+
try:
|
384 |
+
logger.debug("going to save")
|
385 |
+
self.save_cache()
|
386 |
+
logger.debug("saved")
|
387 |
+
except OSError:
|
388 |
+
logger.debug("Cache saving failed while closing file")
|
389 |
+
except NameError:
|
390 |
+
logger.debug("Cache save failed due to interpreter shutdown")
|
391 |
+
close()
|
392 |
+
f.closed = True
|
393 |
+
|
394 |
+
def __getattribute__(self, item):
|
395 |
+
if item in [
|
396 |
+
"load_cache",
|
397 |
+
"_open",
|
398 |
+
"save_cache",
|
399 |
+
"close_and_update",
|
400 |
+
"__init__",
|
401 |
+
"__getattribute__",
|
402 |
+
"__reduce__",
|
403 |
+
"_make_local_details",
|
404 |
+
"open",
|
405 |
+
"cat",
|
406 |
+
"cat_file",
|
407 |
+
"cat_ranges",
|
408 |
+
"get",
|
409 |
+
"read_block",
|
410 |
+
"tail",
|
411 |
+
"head",
|
412 |
+
"_check_file",
|
413 |
+
"_check_cache",
|
414 |
+
"_mkcache",
|
415 |
+
"clear_cache",
|
416 |
+
"clear_expired_cache",
|
417 |
+
"pop_from_cache",
|
418 |
+
"_mkcache",
|
419 |
+
"local_file",
|
420 |
+
"_paths_from_path",
|
421 |
+
"get_mapper",
|
422 |
+
"open_many",
|
423 |
+
"commit_many",
|
424 |
+
"hash_name",
|
425 |
+
"__hash__",
|
426 |
+
"__eq__",
|
427 |
+
"to_json",
|
428 |
+
"cache_size",
|
429 |
+
"pipe_file",
|
430 |
+
"pipe",
|
431 |
+
"start_transaction",
|
432 |
+
"end_transaction",
|
433 |
+
]:
|
434 |
+
# all the methods defined in this class. Note `open` here, since
|
435 |
+
# it calls `_open`, but is actually in superclass
|
436 |
+
return lambda *args, **kw: getattr(type(self), item).__get__(self)(
|
437 |
+
*args, **kw
|
438 |
+
)
|
439 |
+
if item in ["__reduce_ex__"]:
|
440 |
+
raise AttributeError
|
441 |
+
if item in ["transaction"]:
|
442 |
+
# property
|
443 |
+
return type(self).transaction.__get__(self)
|
444 |
+
if item in ["_cache", "transaction_type"]:
|
445 |
+
# class attributes
|
446 |
+
return getattr(type(self), item)
|
447 |
+
if item == "__class__":
|
448 |
+
return type(self)
|
449 |
+
d = object.__getattribute__(self, "__dict__")
|
450 |
+
fs = d.get("fs", None) # fs is not immediately defined
|
451 |
+
if item in d:
|
452 |
+
return d[item]
|
453 |
+
elif fs is not None:
|
454 |
+
if item in fs.__dict__:
|
455 |
+
# attribute of instance
|
456 |
+
return fs.__dict__[item]
|
457 |
+
# attribute belonging to the target filesystem
|
458 |
+
cls = type(fs)
|
459 |
+
m = getattr(cls, item)
|
460 |
+
if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
|
461 |
+
not hasattr(m, "__self__") or m.__self__ is None
|
462 |
+
):
|
463 |
+
# instance method
|
464 |
+
return m.__get__(fs, cls)
|
465 |
+
return m # class method or attribute
|
466 |
+
else:
|
467 |
+
# attributes of the superclass, while target is being set up
|
468 |
+
return super().__getattribute__(item)
|
469 |
+
|
470 |
+
def __eq__(self, other):
|
471 |
+
"""Test for equality."""
|
472 |
+
if self is other:
|
473 |
+
return True
|
474 |
+
if not isinstance(other, type(self)):
|
475 |
+
return False
|
476 |
+
return (
|
477 |
+
self.storage == other.storage
|
478 |
+
and self.kwargs == other.kwargs
|
479 |
+
and self.cache_check == other.cache_check
|
480 |
+
and self.check_files == other.check_files
|
481 |
+
and self.expiry == other.expiry
|
482 |
+
and self.compression == other.compression
|
483 |
+
and self._mapper == other._mapper
|
484 |
+
and self.target_protocol == other.target_protocol
|
485 |
+
)
|
486 |
+
|
487 |
+
def __hash__(self):
|
488 |
+
"""Calculate hash."""
|
489 |
+
return (
|
490 |
+
hash(tuple(self.storage))
|
491 |
+
^ hash(str(self.kwargs))
|
492 |
+
^ hash(self.cache_check)
|
493 |
+
^ hash(self.check_files)
|
494 |
+
^ hash(self.expiry)
|
495 |
+
^ hash(self.compression)
|
496 |
+
^ hash(self._mapper)
|
497 |
+
^ hash(self.target_protocol)
|
498 |
+
)
|
499 |
+
|
500 |
+
def to_json(self):
|
501 |
+
"""Calculate JSON representation.
|
502 |
+
|
503 |
+
Not implemented yet for CachingFileSystem.
|
504 |
+
"""
|
505 |
+
raise NotImplementedError(
|
506 |
+
"CachingFileSystem JSON representation not implemented"
|
507 |
+
)
|
508 |
+
|
509 |
+
|
510 |
+
class WholeFileCacheFileSystem(CachingFileSystem):
|
511 |
+
"""Caches whole remote files on first access
|
512 |
+
|
513 |
+
This class is intended as a layer over any other file system, and
|
514 |
+
will make a local copy of each file accessed, so that all subsequent
|
515 |
+
reads are local. This is similar to ``CachingFileSystem``, but without
|
516 |
+
the block-wise functionality and so can work even when sparse files
|
517 |
+
are not allowed. See its docstring for definition of the init
|
518 |
+
arguments.
|
519 |
+
|
520 |
+
The class still needs access to the remote store for listing files,
|
521 |
+
and may refresh cached files.
|
522 |
+
"""
|
523 |
+
|
524 |
+
protocol = "filecache"
|
525 |
+
local_file = True
|
526 |
+
|
527 |
+
def open_many(self, open_files):
|
528 |
+
paths = [of.path for of in open_files]
|
529 |
+
if "r" in open_files.mode:
|
530 |
+
self._mkcache()
|
531 |
+
else:
|
532 |
+
return [
|
533 |
+
LocalTempFile(
|
534 |
+
self.fs,
|
535 |
+
path,
|
536 |
+
mode=open_files.mode,
|
537 |
+
fn=os.path.join(self.storage[-1], self._mapper(path)),
|
538 |
+
)
|
539 |
+
for path in paths
|
540 |
+
]
|
541 |
+
|
542 |
+
if self.compression:
|
543 |
+
raise NotImplementedError
|
544 |
+
details = [self._check_file(sp) for sp in paths]
|
545 |
+
downpath = [p for p, d in zip(paths, details) if not d]
|
546 |
+
downfn0 = [
|
547 |
+
os.path.join(self.storage[-1], self._mapper(p))
|
548 |
+
for p, d in zip(paths, details)
|
549 |
+
] # keep these path names for opening later
|
550 |
+
downfn = [fn for fn, d in zip(downfn0, details) if not d]
|
551 |
+
if downpath:
|
552 |
+
# skip if all files are already cached and up to date
|
553 |
+
self.fs.get(downpath, downfn)
|
554 |
+
|
555 |
+
# update metadata - only happens when downloads are successful
|
556 |
+
newdetail = [
|
557 |
+
{
|
558 |
+
"original": path,
|
559 |
+
"fn": self._mapper(path),
|
560 |
+
"blocks": True,
|
561 |
+
"time": time.time(),
|
562 |
+
"uid": self.fs.ukey(path),
|
563 |
+
}
|
564 |
+
for path in downpath
|
565 |
+
]
|
566 |
+
for path, detail in zip(downpath, newdetail):
|
567 |
+
self._metadata.update_file(path, detail)
|
568 |
+
self.save_cache()
|
569 |
+
|
570 |
+
def firstpart(fn):
|
571 |
+
# helper to adapt both whole-file and simple-cache
|
572 |
+
return fn[1] if isinstance(fn, tuple) else fn
|
573 |
+
|
574 |
+
return [
|
575 |
+
open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode)
|
576 |
+
for fn0, fn1 in zip(details, downfn0)
|
577 |
+
]
|
578 |
+
|
579 |
+
def commit_many(self, open_files):
|
580 |
+
self.fs.put([f.fn for f in open_files], [f.path for f in open_files])
|
581 |
+
[f.close() for f in open_files]
|
582 |
+
for f in open_files:
|
583 |
+
# in case autocommit is off, and so close did not already delete
|
584 |
+
try:
|
585 |
+
os.remove(f.name)
|
586 |
+
except FileNotFoundError:
|
587 |
+
pass
|
588 |
+
self._cache_size = None
|
589 |
+
|
590 |
+
def _make_local_details(self, path):
|
591 |
+
hash = self._mapper(path)
|
592 |
+
fn = os.path.join(self.storage[-1], hash)
|
593 |
+
detail = {
|
594 |
+
"original": path,
|
595 |
+
"fn": hash,
|
596 |
+
"blocks": True,
|
597 |
+
"time": time.time(),
|
598 |
+
"uid": self.fs.ukey(path),
|
599 |
+
}
|
600 |
+
self._metadata.update_file(path, detail)
|
601 |
+
logger.debug("Copying %s to local cache", path)
|
602 |
+
return fn
|
603 |
+
|
604 |
+
def cat(
|
605 |
+
self,
|
606 |
+
path,
|
607 |
+
recursive=False,
|
608 |
+
on_error="raise",
|
609 |
+
callback=_DEFAULT_CALLBACK,
|
610 |
+
**kwargs,
|
611 |
+
):
|
612 |
+
paths = self.expand_path(
|
613 |
+
path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
|
614 |
+
)
|
615 |
+
getpaths = []
|
616 |
+
storepaths = []
|
617 |
+
fns = []
|
618 |
+
out = {}
|
619 |
+
for p in paths.copy():
|
620 |
+
try:
|
621 |
+
detail = self._check_file(p)
|
622 |
+
if not detail:
|
623 |
+
fn = self._make_local_details(p)
|
624 |
+
getpaths.append(p)
|
625 |
+
storepaths.append(fn)
|
626 |
+
else:
|
627 |
+
detail, fn = detail if isinstance(detail, tuple) else (None, detail)
|
628 |
+
fns.append(fn)
|
629 |
+
except Exception as e:
|
630 |
+
if on_error == "raise":
|
631 |
+
raise
|
632 |
+
if on_error == "return":
|
633 |
+
out[p] = e
|
634 |
+
paths.remove(p)
|
635 |
+
|
636 |
+
if getpaths:
|
637 |
+
self.fs.get(getpaths, storepaths)
|
638 |
+
self.save_cache()
|
639 |
+
|
640 |
+
callback.set_size(len(paths))
|
641 |
+
for p, fn in zip(paths, fns):
|
642 |
+
with open(fn, "rb") as f:
|
643 |
+
out[p] = f.read()
|
644 |
+
callback.relative_update(1)
|
645 |
+
if isinstance(path, str) and len(paths) == 1 and recursive is False:
|
646 |
+
out = out[paths[0]]
|
647 |
+
return out
|
648 |
+
|
649 |
+
def _open(self, path, mode="rb", **kwargs):
|
650 |
+
path = self._strip_protocol(path)
|
651 |
+
if "r" not in mode:
|
652 |
+
fn = self._make_local_details(path)
|
653 |
+
return LocalTempFile(self, path, mode=mode, fn=fn)
|
654 |
+
detail = self._check_file(path)
|
655 |
+
if detail:
|
656 |
+
detail, fn = detail
|
657 |
+
_, blocks = detail["fn"], detail["blocks"]
|
658 |
+
if blocks is True:
|
659 |
+
logger.debug("Opening local copy of %s", path)
|
660 |
+
|
661 |
+
# In order to support downstream filesystems to be able to
|
662 |
+
# infer the compression from the original filename, like
|
663 |
+
# the `TarFileSystem`, let's extend the `io.BufferedReader`
|
664 |
+
# fileobject protocol by adding a dedicated attribute
|
665 |
+
# `original`.
|
666 |
+
f = open(fn, mode)
|
667 |
+
f.original = detail.get("original")
|
668 |
+
return f
|
669 |
+
else:
|
670 |
+
raise ValueError(
|
671 |
+
f"Attempt to open partially cached file {path}"
|
672 |
+
f" as a wholly cached file"
|
673 |
+
)
|
674 |
+
else:
|
675 |
+
fn = self._make_local_details(path)
|
676 |
+
kwargs["mode"] = mode
|
677 |
+
|
678 |
+
# call target filesystems open
|
679 |
+
self._mkcache()
|
680 |
+
if self.compression:
|
681 |
+
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
|
682 |
+
if isinstance(f, AbstractBufferedFile):
|
683 |
+
# want no type of caching if just downloading whole thing
|
684 |
+
f.cache = BaseCache(0, f.cache.fetcher, f.size)
|
685 |
+
comp = (
|
686 |
+
infer_compression(path)
|
687 |
+
if self.compression == "infer"
|
688 |
+
else self.compression
|
689 |
+
)
|
690 |
+
f = compr[comp](f, mode="rb")
|
691 |
+
data = True
|
692 |
+
while data:
|
693 |
+
block = getattr(f, "blocksize", 5 * 2**20)
|
694 |
+
data = f.read(block)
|
695 |
+
f2.write(data)
|
696 |
+
else:
|
697 |
+
self.fs.get_file(path, fn)
|
698 |
+
self.save_cache()
|
699 |
+
return self._open(path, mode)
|
700 |
+
|
701 |
+
|
702 |
+
class SimpleCacheFileSystem(WholeFileCacheFileSystem):
|
703 |
+
"""Caches whole remote files on first access
|
704 |
+
|
705 |
+
This class is intended as a layer over any other file system, and
|
706 |
+
will make a local copy of each file accessed, so that all subsequent
|
707 |
+
reads are local. This implementation only copies whole files, and
|
708 |
+
does not keep any metadata about the download time or file details.
|
709 |
+
It is therefore safer to use in multi-threaded/concurrent situations.
|
710 |
+
|
711 |
+
This is the only one of the caching filesystems that supports write: you will
|
712 |
+
be given a real local open file, and upon close and commit, it will be
|
713 |
+
uploaded to the target filesystem; the writability of the target URL is
|
714 |
+
not checked until that time.
|
715 |
+
|
716 |
+
"""
|
717 |
+
|
718 |
+
protocol = "simplecache"
|
719 |
+
local_file = True
|
720 |
+
transaction_type = WriteCachedTransaction
|
721 |
+
|
722 |
+
def __init__(self, **kwargs):
|
723 |
+
kw = kwargs.copy()
|
724 |
+
for key in ["cache_check", "expiry_time", "check_files"]:
|
725 |
+
kw[key] = False
|
726 |
+
super().__init__(**kw)
|
727 |
+
for storage in self.storage:
|
728 |
+
if not os.path.exists(storage):
|
729 |
+
os.makedirs(storage, exist_ok=True)
|
730 |
+
|
731 |
+
def _check_file(self, path):
|
732 |
+
self._check_cache()
|
733 |
+
sha = self._mapper(path)
|
734 |
+
for storage in self.storage:
|
735 |
+
fn = os.path.join(storage, sha)
|
736 |
+
if os.path.exists(fn):
|
737 |
+
return fn
|
738 |
+
|
739 |
+
def save_cache(self):
|
740 |
+
pass
|
741 |
+
|
742 |
+
def load_cache(self):
|
743 |
+
pass
|
744 |
+
|
745 |
+
def pipe_file(self, path, value=None, **kwargs):
|
746 |
+
if self._intrans:
|
747 |
+
with self.open(path, "wb") as f:
|
748 |
+
f.write(value)
|
749 |
+
else:
|
750 |
+
super().pipe_file(path, value)
|
751 |
+
|
752 |
+
def pipe(self, path, value=None, **kwargs):
|
753 |
+
if isinstance(path, str):
|
754 |
+
self.pipe_file(self._strip_protocol(path), value, **kwargs)
|
755 |
+
elif isinstance(path, dict):
|
756 |
+
for k, v in path.items():
|
757 |
+
self.pipe_file(self._strip_protocol(k), v, **kwargs)
|
758 |
+
else:
|
759 |
+
raise ValueError("path must be str or dict")
|
760 |
+
|
761 |
+
def cat_ranges(
|
762 |
+
self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
|
763 |
+
):
|
764 |
+
lpaths = [self._check_file(p) for p in paths]
|
765 |
+
rpaths = [p for l, p in zip(lpaths, paths) if l is False]
|
766 |
+
lpaths = [l for l, p in zip(lpaths, paths) if l is False]
|
767 |
+
self.fs.get(rpaths, lpaths)
|
768 |
+
return super().cat_ranges(
|
769 |
+
paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
|
770 |
+
)
|
771 |
+
|
772 |
+
def _open(self, path, mode="rb", **kwargs):
|
773 |
+
path = self._strip_protocol(path)
|
774 |
+
sha = self._mapper(path)
|
775 |
+
|
776 |
+
if "r" not in mode:
|
777 |
+
fn = os.path.join(self.storage[-1], sha)
|
778 |
+
return LocalTempFile(
|
779 |
+
self, path, mode=mode, autocommit=not self._intrans, fn=fn
|
780 |
+
)
|
781 |
+
fn = self._check_file(path)
|
782 |
+
if fn:
|
783 |
+
return open(fn, mode)
|
784 |
+
|
785 |
+
fn = os.path.join(self.storage[-1], sha)
|
786 |
+
logger.debug("Copying %s to local cache", path)
|
787 |
+
kwargs["mode"] = mode
|
788 |
+
|
789 |
+
self._mkcache()
|
790 |
+
self._cache_size = None
|
791 |
+
if self.compression:
|
792 |
+
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
|
793 |
+
if isinstance(f, AbstractBufferedFile):
|
794 |
+
# want no type of caching if just downloading whole thing
|
795 |
+
f.cache = BaseCache(0, f.cache.fetcher, f.size)
|
796 |
+
comp = (
|
797 |
+
infer_compression(path)
|
798 |
+
if self.compression == "infer"
|
799 |
+
else self.compression
|
800 |
+
)
|
801 |
+
f = compr[comp](f, mode="rb")
|
802 |
+
data = True
|
803 |
+
while data:
|
804 |
+
block = getattr(f, "blocksize", 5 * 2**20)
|
805 |
+
data = f.read(block)
|
806 |
+
f2.write(data)
|
807 |
+
else:
|
808 |
+
self.fs.get_file(path, fn)
|
809 |
+
return self._open(path, mode)
|
810 |
+
|
811 |
+
|
812 |
+
class LocalTempFile:
|
813 |
+
"""A temporary local file, which will be uploaded on commit"""
|
814 |
+
|
815 |
+
def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0):
|
816 |
+
self.fn = fn
|
817 |
+
self.fh = open(fn, mode)
|
818 |
+
self.mode = mode
|
819 |
+
if seek:
|
820 |
+
self.fh.seek(seek)
|
821 |
+
self.path = path
|
822 |
+
self.fs = fs
|
823 |
+
self.closed = False
|
824 |
+
self.autocommit = autocommit
|
825 |
+
|
826 |
+
def __reduce__(self):
|
827 |
+
# always open in r+b to allow continuing writing at a location
|
828 |
+
return (
|
829 |
+
LocalTempFile,
|
830 |
+
(self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
|
831 |
+
)
|
832 |
+
|
833 |
+
def __enter__(self):
|
834 |
+
return self.fh
|
835 |
+
|
836 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
837 |
+
self.close()
|
838 |
+
|
839 |
+
def close(self):
|
840 |
+
if self.closed:
|
841 |
+
return
|
842 |
+
self.fh.close()
|
843 |
+
self.closed = True
|
844 |
+
if self.autocommit:
|
845 |
+
self.commit()
|
846 |
+
|
847 |
+
def discard(self):
|
848 |
+
self.fh.close()
|
849 |
+
os.remove(self.fn)
|
850 |
+
|
851 |
+
def commit(self):
|
852 |
+
self.fs.put(self.fn, self.path)
|
853 |
+
try:
|
854 |
+
os.remove(self.fn)
|
855 |
+
except (PermissionError, FileNotFoundError):
|
856 |
+
# file path may be held by new version of the file on windows
|
857 |
+
pass
|
858 |
+
|
859 |
+
@property
|
860 |
+
def name(self):
|
861 |
+
return self.fn
|
862 |
+
|
863 |
+
def __getattr__(self, item):
|
864 |
+
return getattr(self.fh, item)
|
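The three protocols defined above ("blockcache"/"cached", "filecache" and "simplecache") are normally reached through fsspec rather than instantiated directly. A usage sketch; the URL and cache directory are placeholders:

import fsspec

# Whole-file cache over HTTP: the first read downloads, later reads are local
fs = fsspec.filesystem(
    "filecache",
    target_protocol="https",
    cache_storage="/tmp/fsspec_cache",   # writable cache directory
)
with fs.open("https://example.com/data.csv", "rb") as f:   # placeholder URL
    head = f.read(1024)

# The same via URL chaining, using simplecache (no metadata kept on disk)
with fsspec.open(
    "simplecache::https://example.com/data.csv",
    simplecache={"cache_storage": "/tmp/fsspec_cache"},
) as f:
    head = f.read(1024)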
lib/python3.11/site-packages/fsspec/implementations/dask.py
ADDED
@@ -0,0 +1,152 @@
import dask
from distributed.client import Client, _get_global_client
from distributed.worker import Worker

from fsspec import filesystem
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
from fsspec.utils import infer_storage_options


def _get_client(client):
    if client is None:
        return _get_global_client()
    elif isinstance(client, Client):
        return client
    else:
        # e.g., connection string
        return Client(client)


def _in_worker():
    return bool(Worker._instances)


class DaskWorkerFileSystem(AbstractFileSystem):
    """View files accessible to a worker as any other remote file-system

    When instances are run on the worker, uses the real filesystem. When
    run on the client, they call the worker to provide information or data.

    **Warning** this implementation is experimental, and read-only for now.
    """

    def __init__(
        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
    ):
        super().__init__(**kwargs)
        if not (fs is None) ^ (target_protocol is None):
            raise ValueError(
                "Please provide one of filesystem instance (fs) or"
                " target_protocol, not both"
            )
        self.target_protocol = target_protocol
        self.target_options = target_options
        self.worker = None
        self.client = client
        self.fs = fs
        self._determine_worker()

    @staticmethod
    def _get_kwargs_from_urls(path):
        so = infer_storage_options(path)
        if "host" in so and "port" in so:
            return {"client": f"{so['host']}:{so['port']}"}
        else:
            return {}

    def _determine_worker(self):
        if _in_worker():
            self.worker = True
            if self.fs is None:
                self.fs = filesystem(
                    self.target_protocol, **(self.target_options or {})
                )
        else:
            self.worker = False
            self.client = _get_client(self.client)
            self.rfs = dask.delayed(self)

    def mkdir(self, *args, **kwargs):
        if self.worker:
            self.fs.mkdir(*args, **kwargs)
        else:
            self.rfs.mkdir(*args, **kwargs).compute()

    def rm(self, *args, **kwargs):
        if self.worker:
            self.fs.rm(*args, **kwargs)
        else:
            self.rfs.rm(*args, **kwargs).compute()

    def copy(self, *args, **kwargs):
        if self.worker:
            self.fs.copy(*args, **kwargs)
        else:
            self.rfs.copy(*args, **kwargs).compute()

    def mv(self, *args, **kwargs):
        if self.worker:
            self.fs.mv(*args, **kwargs)
        else:
            self.rfs.mv(*args, **kwargs).compute()

    def ls(self, *args, **kwargs):
        if self.worker:
            return self.fs.ls(*args, **kwargs)
        else:
            return self.rfs.ls(*args, **kwargs).compute()

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        if self.worker:
            return self.fs._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )
        else:
            return DaskFile(
                fs=self,
                path=path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs,
            )

    def fetch_range(self, path, mode, start, end):
        if self.worker:
            with self._open(path, mode) as f:
                f.seek(start)
                return f.read(end - start)
        else:
            return self.rfs.fetch_range(path, mode, start, end).compute()


class DaskFile(AbstractBufferedFile):
    def __init__(self, mode="rb", **kwargs):
        if mode != "rb":
            raise ValueError('Remote dask files can only be opened in "rb" mode')
        super().__init__(**kwargs)

    def _upload_chunk(self, final=False):
        pass

    def _initiate_upload(self):
        """Create remote file/upload"""
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        return self.fs.fetch_range(self.path, self.mode, start, end)
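The split in every method above is the whole trick: on a worker the call goes straight to the wrapped filesystem, while on the client it goes through self.rfs (a dask.delayed proxy of the instance) and .compute() ships the call to a worker. A minimal usage sketch, assuming a running distributed cluster and an installed backend for the target protocol; the scheduler address, bucket and file names are placeholders, not taken from the source:

import fsspec
from distributed import Client

client = Client("scheduler-address:8786")   # placeholder scheduler address

# "dask" dispatches to DaskWorkerFileSystem; workers see the real "s3" filesystem
fs = fsspec.filesystem("dask", target_protocol="s3", target_options={"anon": True})

print(fs.ls("example-bucket/data"))         # listing is computed on a worker
with fs.open("example-bucket/data/part-0.bin", "rb") as f:
    header = f.read(128)                    # bytes arrive via fetch_range(...).compute()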
lib/python3.11/site-packages/fsspec/implementations/data.py
ADDED
@@ -0,0 +1,48 @@
import base64
import io
from urllib.parse import unquote

from fsspec import AbstractFileSystem


class DataFileSystem(AbstractFileSystem):
    """A handy decoder for data-URLs

    Example
    -------
    >>> with fsspec.open("data:,Hello%2C%20World%21") as f:
    ...     print(f.read())
    b"Hello, World!"

    """

    protocol = "data"

    def __init__(self, **kwargs):
        """No parameters for this filesystem"""
        super().__init__(**kwargs)

    def cat_file(self, path, start=None, end=None, **kwargs):
        pref, data = path.split(",", 1)
        if pref.endswith("base64"):
            return base64.b64decode(data)[start:end]
        return unquote(data).encode()[start:end]

    def info(self, path, **kwargs):
        pref, name = path.split(",", 1)
        data = self.cat_file(path)
        mime = pref.split(":", 1)[1].split(";", 1)[0]
        return {"name": name, "size": len(data), "type": "file", "mimetype": mime}

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        if "r" not in mode:
            raise ValueError("Read only filesystem")
        return io.BytesIO(self.cat_file(path))
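cat_file splits the URL at the first comma and either base64-decodes the payload (when the media-type prefix ends in "base64") or percent-decodes it. A short sketch of both branches; the payloads are trivial examples, not taken from the source:

import fsspec

# percent-encoded payload (the docstring's own example)
with fsspec.open("data:,Hello%2C%20World%21") as f:
    assert f.read() == b"Hello, World!"

# base64 payload: "SGVsbG8=" decodes to b"Hello"
with fsspec.open("data:text/plain;base64,SGVsbG8=") as f:
    assert f.read() == b"Hello"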
lib/python3.11/site-packages/fsspec/implementations/dbfs.py
ADDED
@@ -0,0 +1,457 @@
1 |
+
import base64
|
2 |
+
import urllib
|
3 |
+
|
4 |
+
import requests
|
5 |
+
|
6 |
+
from fsspec import AbstractFileSystem
|
7 |
+
from fsspec.spec import AbstractBufferedFile
|
8 |
+
|
9 |
+
|
10 |
+
class DatabricksException(Exception):
|
11 |
+
"""
|
12 |
+
Helper class for exceptions raised in this module.
|
13 |
+
"""
|
14 |
+
|
15 |
+
def __init__(self, error_code, message):
|
16 |
+
"""Create a new DatabricksException"""
|
17 |
+
super().__init__(message)
|
18 |
+
|
19 |
+
self.error_code = error_code
|
20 |
+
self.message = message
|
21 |
+
|
22 |
+
|
23 |
+
class DatabricksFileSystem(AbstractFileSystem):
|
24 |
+
"""
|
25 |
+
Get access to the Databricks filesystem implementation over HTTP.
|
26 |
+
Can be used inside and outside of a databricks cluster.
|
27 |
+
"""
|
28 |
+
|
29 |
+
def __init__(self, instance, token, **kwargs):
|
30 |
+
"""
|
31 |
+
Create a new DatabricksFileSystem.
|
32 |
+
|
33 |
+
Parameters
|
34 |
+
----------
|
35 |
+
instance: str
|
36 |
+
The instance URL of the databricks cluster.
|
37 |
+
For example for an Azure databricks cluster, this
|
38 |
+
has the form adb-<some-number>.<two digits>.azuredatabricks.net.
|
39 |
+
token: str
|
40 |
+
Your personal token. Find out more
|
41 |
+
here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
|
42 |
+
"""
|
43 |
+
self.instance = instance
|
44 |
+
self.token = token
|
45 |
+
|
46 |
+
self.session = requests.Session()
|
47 |
+
self.session.headers.update({"Authorization": f"Bearer {self.token}"})
|
48 |
+
|
49 |
+
super().__init__(**kwargs)
|
50 |
+
|
51 |
+
def ls(self, path, detail=True):
|
52 |
+
"""
|
53 |
+
List the contents of the given path.
|
54 |
+
|
55 |
+
Parameters
|
56 |
+
----------
|
57 |
+
path: str
|
58 |
+
Absolute path
|
59 |
+
detail: bool
|
60 |
+
Return not only the list of filenames,
|
61 |
+
but also additional information on file sizes
|
62 |
+
and types.
|
63 |
+
"""
|
64 |
+
out = self._ls_from_cache(path)
|
65 |
+
if not out:
|
66 |
+
try:
|
67 |
+
r = self._send_to_api(
|
68 |
+
method="get", endpoint="list", json={"path": path}
|
69 |
+
)
|
70 |
+
except DatabricksException as e:
|
71 |
+
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
72 |
+
raise FileNotFoundError(e.message)
|
73 |
+
|
74 |
+
raise e
|
75 |
+
files = r["files"]
|
76 |
+
out = [
|
77 |
+
{
|
78 |
+
"name": o["path"],
|
79 |
+
"type": "directory" if o["is_dir"] else "file",
|
80 |
+
"size": o["file_size"],
|
81 |
+
}
|
82 |
+
for o in files
|
83 |
+
]
|
84 |
+
self.dircache[path] = out
|
85 |
+
|
86 |
+
if detail:
|
87 |
+
return out
|
88 |
+
return [o["name"] for o in out]
|
89 |
+
|
90 |
+
def makedirs(self, path, exist_ok=True):
|
91 |
+
"""
|
92 |
+
Create a given absolute path and all of its parents.
|
93 |
+
|
94 |
+
Parameters
|
95 |
+
----------
|
96 |
+
path: str
|
97 |
+
Absolute path to create
|
98 |
+
exist_ok: bool
|
99 |
+
If false, checks if the folder
|
100 |
+
exists before creating it (and raises an
|
101 |
+
Exception if this is the case)
|
102 |
+
"""
|
103 |
+
if not exist_ok:
|
104 |
+
try:
|
105 |
+
# If the following succeeds, the path is already present
|
106 |
+
self._send_to_api(
|
107 |
+
method="get", endpoint="get-status", json={"path": path}
|
108 |
+
)
|
109 |
+
raise FileExistsError(f"Path {path} already exists")
|
110 |
+
except DatabricksException as e:
|
111 |
+
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
112 |
+
pass
|
113 |
+
|
114 |
+
try:
|
115 |
+
self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
|
116 |
+
except DatabricksException as e:
|
117 |
+
if e.error_code == "RESOURCE_ALREADY_EXISTS":
|
118 |
+
raise FileExistsError(e.message)
|
119 |
+
|
120 |
+
raise e
|
121 |
+
self.invalidate_cache(self._parent(path))
|
122 |
+
|
123 |
+
def mkdir(self, path, create_parents=True, **kwargs):
|
124 |
+
"""
|
125 |
+
Create a given absolute path and all of its parents.
|
126 |
+
|
127 |
+
Parameters
|
128 |
+
----------
|
129 |
+
path: str
|
130 |
+
Absolute path to create
|
131 |
+
create_parents: bool
|
132 |
+
Whether to create all parents or not.
|
133 |
+
"False" is not implemented so far.
|
134 |
+
"""
|
135 |
+
if not create_parents:
|
136 |
+
raise NotImplementedError
|
137 |
+
|
138 |
+
self.mkdirs(path, **kwargs)
|
139 |
+
|
140 |
+
def rm(self, path, recursive=False):
|
141 |
+
"""
|
142 |
+
Remove the file or folder at the given absolute path.
|
143 |
+
|
144 |
+
Parameters
|
145 |
+
----------
|
146 |
+
path: str
|
147 |
+
Absolute path what to remove
|
148 |
+
recursive: bool
|
149 |
+
Recursively delete all files in a folder.
|
150 |
+
"""
|
151 |
+
try:
|
152 |
+
self._send_to_api(
|
153 |
+
method="post",
|
154 |
+
endpoint="delete",
|
155 |
+
json={"path": path, "recursive": recursive},
|
156 |
+
)
|
157 |
+
except DatabricksException as e:
|
158 |
+
# This is not really an exception, it just means
|
159 |
+
# not everything was deleted so far
|
160 |
+
if e.error_code == "PARTIAL_DELETE":
|
161 |
+
self.rm(path=path, recursive=recursive)
|
162 |
+
elif e.error_code == "IO_ERROR":
|
163 |
+
# Using the same exception as the os module would use here
|
164 |
+
raise OSError(e.message)
|
165 |
+
|
166 |
+
raise e
|
167 |
+
self.invalidate_cache(self._parent(path))
|
168 |
+
|
169 |
+
def mv(self, source_path, destination_path, recursive=False, maxdepth=None):
|
170 |
+
"""
|
171 |
+
Move a source to a destination path.
|
172 |
+
|
173 |
+
A note from the original [databricks API manual]
|
174 |
+
(https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).
|
175 |
+
|
176 |
+
When moving a large number of files the API call will time out after
|
177 |
+
approximately 60s, potentially resulting in partially moved data.
|
178 |
+
Therefore, for operations that move more than 10k files, we strongly
|
179 |
+
discourage using the DBFS REST API.
|
180 |
+
|
181 |
+
Parameters
|
182 |
+
----------
|
183 |
+
source_path: str
|
184 |
+
From where to move (absolute path)
|
185 |
+
destination_path: str
|
186 |
+
To where to move (absolute path)
|
187 |
+
recursive: bool
|
188 |
+
Not implemented to far.
|
189 |
+
maxdepth:
|
190 |
+
Not implemented to far.
|
191 |
+
"""
|
192 |
+
if recursive:
|
193 |
+
raise NotImplementedError
|
194 |
+
if maxdepth:
|
195 |
+
raise NotImplementedError
|
196 |
+
|
197 |
+
try:
|
198 |
+
self._send_to_api(
|
199 |
+
method="post",
|
200 |
+
endpoint="move",
|
201 |
+
json={"source_path": source_path, "destination_path": destination_path},
|
202 |
+
)
|
203 |
+
except DatabricksException as e:
|
204 |
+
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
205 |
+
raise FileNotFoundError(e.message)
|
206 |
+
elif e.error_code == "RESOURCE_ALREADY_EXISTS":
|
207 |
+
raise FileExistsError(e.message)
|
208 |
+
|
209 |
+
raise e
|
210 |
+
self.invalidate_cache(self._parent(source_path))
|
211 |
+
self.invalidate_cache(self._parent(destination_path))
|
212 |
+
|
213 |
+
def _open(self, path, mode="rb", block_size="default", **kwargs):
|
214 |
+
"""
|
215 |
+
Overwrite the base class method to make sure to create a DBFile.
|
216 |
+
All arguments are copied from the base method.
|
217 |
+
|
218 |
+
Only the default blocksize is allowed.
|
219 |
+
"""
|
220 |
+
return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
|
221 |
+
|
222 |
+
def _send_to_api(self, method, endpoint, json):
|
223 |
+
"""
|
224 |
+
Send the given json to the DBFS API
|
225 |
+
using a get or post request (specified by the argument `method`).
|
226 |
+
|
227 |
+
Parameters
|
228 |
+
----------
|
229 |
+
method: str
|
230 |
+
Which http method to use for communication; "get" or "post".
|
231 |
+
endpoint: str
|
232 |
+
Where to send the request to (last part of the API URL)
|
233 |
+
json: dict
|
234 |
+
Dictionary of information to send
|
235 |
+
"""
|
236 |
+
if method == "post":
|
237 |
+
session_call = self.session.post
|
238 |
+
elif method == "get":
|
239 |
+
session_call = self.session.get
|
240 |
+
else:
|
241 |
+
raise ValueError(f"Do not understand method {method}")
|
242 |
+
|
243 |
+
url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
|
244 |
+
|
245 |
+
r = session_call(url, json=json)
|
246 |
+
|
247 |
+
# The DBFS API will return a json, also in case of an exception.
|
248 |
+
# We want to preserve this information as good as possible.
|
249 |
+
try:
|
250 |
+
r.raise_for_status()
|
251 |
+
except requests.HTTPError as e:
|
252 |
+
# try to extract json error message
|
253 |
+
# if that fails, fall back to the original exception
|
254 |
+
try:
|
255 |
+
exception_json = e.response.json()
|
256 |
+
except Exception:
|
257 |
+
raise e
|
258 |
+
|
259 |
+
raise DatabricksException(**exception_json)
|
260 |
+
|
261 |
+
return r.json()
|
262 |
+
|
263 |
+
def _create_handle(self, path, overwrite=True):
|
264 |
+
"""
|
265 |
+
Internal function to create a handle, which can be used to
|
266 |
+
write blocks of a file to DBFS.
|
267 |
+
A handle has a unique identifier which needs to be passed
|
268 |
+
whenever written during this transaction.
|
269 |
+
The handle is active for 10 minutes - after that a new
|
270 |
+
write transaction needs to be created.
|
271 |
+
Make sure to close the handle after you are finished.
|
272 |
+
|
273 |
+
Parameters
|
274 |
+
----------
|
275 |
+
path: str
|
276 |
+
Absolute path for this file.
|
277 |
+
overwrite: bool
|
278 |
+
If a file already exist at this location, either overwrite
|
279 |
+
it or raise an exception.
|
280 |
+
"""
|
281 |
+
try:
|
282 |
+
r = self._send_to_api(
|
283 |
+
method="post",
|
284 |
+
endpoint="create",
|
285 |
+
json={"path": path, "overwrite": overwrite},
|
286 |
+
)
|
287 |
+
return r["handle"]
|
288 |
+
except DatabricksException as e:
|
289 |
+
if e.error_code == "RESOURCE_ALREADY_EXISTS":
|
290 |
+
raise FileExistsError(e.message)
|
291 |
+
|
292 |
+
raise e
|
293 |
+
|
294 |
+
def _close_handle(self, handle):
|
295 |
+
"""
|
296 |
+
Close a handle, which was opened by :func:`_create_handle`.
|
297 |
+
|
298 |
+
Parameters
|
299 |
+
----------
|
300 |
+
handle: str
|
301 |
+
Which handle to close.
|
302 |
+
"""
|
303 |
+
try:
|
304 |
+
self._send_to_api(method="post", endpoint="close", json={"handle": handle})
|
305 |
+
except DatabricksException as e:
|
306 |
+
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
307 |
+
raise FileNotFoundError(e.message)
|
308 |
+
|
309 |
+
raise e
|
310 |
+
|
311 |
+
def _add_data(self, handle, data):
|
312 |
+
"""
|
313 |
+
Upload data to an already opened file handle
|
314 |
+
(opened by :func:`_create_handle`).
|
315 |
+
The maximal allowed data size is 1MB after
|
316 |
+
conversion to base64.
|
317 |
+
Remember to close the handle when you are finished.
|
318 |
+
|
319 |
+
Parameters
|
320 |
+
----------
|
321 |
+
handle: str
|
322 |
+
Which handle to upload data to.
|
323 |
+
data: bytes
|
324 |
+
Block of data to add to the handle.
|
325 |
+
"""
|
326 |
+
data = base64.b64encode(data).decode()
|
327 |
+
try:
|
328 |
+
self._send_to_api(
|
329 |
+
method="post",
|
330 |
+
endpoint="add-block",
|
331 |
+
json={"handle": handle, "data": data},
|
332 |
+
)
|
333 |
+
except DatabricksException as e:
|
334 |
+
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
335 |
+
raise FileNotFoundError(e.message)
|
336 |
+
elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
|
337 |
+
raise ValueError(e.message)
|
338 |
+
|
339 |
+
raise e
|
340 |
+
|
341 |
+
def _get_data(self, path, start, end):
|
342 |
+
"""
|
343 |
+
Download data in bytes from a given absolute path in a block
|
344 |
+
from [start, start+length].
|
345 |
+
The maximum number of allowed bytes to read is 1MB.
|
346 |
+
|
347 |
+
Parameters
|
348 |
+
----------
|
349 |
+
path: str
|
350 |
+
Absolute path to download data from
|
351 |
+
start: int
|
352 |
+
Start position of the block
|
353 |
+
end: int
|
354 |
+
End position of the block
|
355 |
+
"""
|
356 |
+
try:
|
357 |
+
r = self._send_to_api(
|
358 |
+
method="get",
|
359 |
+
endpoint="read",
|
360 |
+
json={"path": path, "offset": start, "length": end - start},
|
361 |
+
)
|
362 |
+
return base64.b64decode(r["data"])
|
363 |
+
except DatabricksException as e:
|
364 |
+
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
365 |
+
raise FileNotFoundError(e.message)
|
366 |
+
elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
|
367 |
+
raise ValueError(e.message)
|
368 |
+
|
369 |
+
raise e
|
370 |
+
|
371 |
+
def invalidate_cache(self, path=None):
|
372 |
+
if path is None:
|
373 |
+
self.dircache.clear()
|
374 |
+
else:
|
375 |
+
self.dircache.pop(path, None)
|
376 |
+
super().invalidate_cache(path)
|
377 |
+
|
378 |
+
|
379 |
+
class DatabricksFile(AbstractBufferedFile):
|
380 |
+
"""
|
381 |
+
Helper class for files referenced in the DatabricksFileSystem.
|
382 |
+
"""
|
383 |
+
|
384 |
+
DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size
|
385 |
+
|
386 |
+
def __init__(
|
387 |
+
self,
|
388 |
+
fs,
|
389 |
+
path,
|
390 |
+
mode="rb",
|
391 |
+
block_size="default",
|
392 |
+
autocommit=True,
|
393 |
+
cache_type="readahead",
|
394 |
+
cache_options=None,
|
395 |
+
**kwargs,
|
396 |
+
):
|
397 |
+
"""
|
398 |
+
Create a new instance of the DatabricksFile.
|
399 |
+
|
400 |
+
The blocksize needs to be the default one.
|
401 |
+
"""
|
402 |
+
if block_size is None or block_size == "default":
|
403 |
+
block_size = self.DEFAULT_BLOCK_SIZE
|
404 |
+
|
405 |
+
assert (
|
406 |
+
block_size == self.DEFAULT_BLOCK_SIZE
|
407 |
+
), f"Only the default block size is allowed, not {block_size}"
|
408 |
+
|
409 |
+
super().__init__(
|
410 |
+
fs,
|
411 |
+
path,
|
412 |
+
mode=mode,
|
413 |
+
block_size=block_size,
|
414 |
+
autocommit=autocommit,
|
415 |
+
cache_type=cache_type,
|
416 |
+
cache_options=cache_options or {},
|
417 |
+
**kwargs,
|
418 |
+
)
|
419 |
+
|
420 |
+
def _initiate_upload(self):
|
421 |
+
"""Internal function to start a file upload"""
|
422 |
+
self.handle = self.fs._create_handle(self.path)
|
423 |
+
|
424 |
+
def _upload_chunk(self, final=False):
|
425 |
+
"""Internal function to add a chunk of data to a started upload"""
|
426 |
+
self.buffer.seek(0)
|
427 |
+
data = self.buffer.getvalue()
|
428 |
+
|
429 |
+
data_chunks = [
|
430 |
+
data[start:end] for start, end in self._to_sized_blocks(len(data))
|
431 |
+
]
|
432 |
+
|
433 |
+
for data_chunk in data_chunks:
|
434 |
+
self.fs._add_data(handle=self.handle, data=data_chunk)
|
435 |
+
|
436 |
+
if final:
|
437 |
+
self.fs._close_handle(handle=self.handle)
|
438 |
+
return True
|
439 |
+
|
440 |
+
def _fetch_range(self, start, end):
|
441 |
+
"""Internal function to download a block of data"""
|
442 |
+
return_buffer = b""
|
443 |
+
length = end - start
|
444 |
+
for chunk_start, chunk_end in self._to_sized_blocks(length, start):
|
445 |
+
return_buffer += self.fs._get_data(
|
446 |
+
path=self.path, start=chunk_start, end=chunk_end
|
447 |
+
)
|
448 |
+
|
449 |
+
return return_buffer
|
450 |
+
|
451 |
+
def _to_sized_blocks(self, length, start=0):
|
452 |
+
"""Helper function to split a range from 0 to total_length into bloksizes"""
|
453 |
+
end = start + length
|
454 |
+
for data_chunk in range(start, end, self.blocksize):
|
455 |
+
data_start = data_chunk
|
456 |
+
data_end = min(end, data_chunk + self.blocksize)
|
457 |
+
yield data_start, data_end
|
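_to_sized_blocks is what keeps both directions of DatabricksFile I/O inside the API's 1 MB-per-request limit: uploads are cut into handle-sized blocks before add-block, and _fetch_range stitches reads back together from 1 MB read calls. A minimal usage sketch, assuming a reachable workspace; the instance URL and token below are placeholders:

import fsspec

fs = fsspec.filesystem(
    "dbfs",
    instance="adb-1234567890123456.7.azuredatabricks.net",  # placeholder workspace host
    token="<personal-access-token>",                         # placeholder token
)

fs.makedirs("/tmp/demo", exist_ok=True)
with fs.open("/tmp/demo/hello.txt", "wb") as f:
    f.write(b"hello dbfs")       # buffered, then sent as <=1 MB base64 blocks on close
print(fs.ls("/tmp/demo", detail=False))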
lib/python3.11/site-packages/fsspec/implementations/dirfs.py
ADDED
@@ -0,0 +1,358 @@
1 |
+
from .. import filesystem
|
2 |
+
from ..asyn import AsyncFileSystem
|
3 |
+
|
4 |
+
|
5 |
+
class DirFileSystem(AsyncFileSystem):
|
6 |
+
"""Directory prefix filesystem
|
7 |
+
|
8 |
+
The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
|
9 |
+
is relative to the `path`. After performing the necessary paths operation it
|
10 |
+
delegates everything to the wrapped filesystem.
|
11 |
+
"""
|
12 |
+
|
13 |
+
protocol = "dir"
|
14 |
+
|
15 |
+
def __init__(
|
16 |
+
self,
|
17 |
+
path=None,
|
18 |
+
fs=None,
|
19 |
+
fo=None,
|
20 |
+
target_protocol=None,
|
21 |
+
target_options=None,
|
22 |
+
**storage_options,
|
23 |
+
):
|
24 |
+
"""
|
25 |
+
Parameters
|
26 |
+
----------
|
27 |
+
path: str
|
28 |
+
Path to the directory.
|
29 |
+
fs: AbstractFileSystem
|
30 |
+
An instantiated filesystem to wrap.
|
31 |
+
target_protocol, target_options:
|
32 |
+
if fs is none, construct it from these
|
33 |
+
fo: str
|
34 |
+
Alternate for path; do not provide both
|
35 |
+
"""
|
36 |
+
super().__init__(**storage_options)
|
37 |
+
if fs is None:
|
38 |
+
fs = filesystem(protocol=target_protocol, **(target_options or {}))
|
39 |
+
if (path is not None) ^ (fo is not None) is False:
|
40 |
+
raise ValueError("Provide path or fo, not both")
|
41 |
+
path = path or fo
|
42 |
+
|
43 |
+
if self.asynchronous and not fs.async_impl:
|
44 |
+
raise ValueError("can't use asynchronous with non-async fs")
|
45 |
+
|
46 |
+
if fs.async_impl and self.asynchronous != fs.asynchronous:
|
47 |
+
raise ValueError("both dirfs and fs should be in the same sync/async mode")
|
48 |
+
|
49 |
+
self.path = fs._strip_protocol(path)
|
50 |
+
self.fs = fs
|
51 |
+
|
52 |
+
def _join(self, path):
|
53 |
+
if isinstance(path, str):
|
54 |
+
if not self.path:
|
55 |
+
return path
|
56 |
+
if not path:
|
57 |
+
return self.path
|
58 |
+
return self.fs.sep.join((self.path, self._strip_protocol(path)))
|
59 |
+
return [self._join(_path) for _path in path]
|
60 |
+
|
61 |
+
def _relpath(self, path):
|
62 |
+
if isinstance(path, str):
|
63 |
+
if not self.path:
|
64 |
+
return path
|
65 |
+
if path == self.path:
|
66 |
+
return ""
|
67 |
+
prefix = self.path + self.fs.sep
|
68 |
+
assert path.startswith(prefix)
|
69 |
+
return path[len(prefix) :]
|
70 |
+
return [self._relpath(_path) for _path in path]
|
71 |
+
|
72 |
+
# Wrappers below
|
73 |
+
|
74 |
+
@property
|
75 |
+
def sep(self):
|
76 |
+
return self.fs.sep
|
77 |
+
|
78 |
+
async def set_session(self, *args, **kwargs):
|
79 |
+
return await self.fs.set_session(*args, **kwargs)
|
80 |
+
|
81 |
+
async def _rm_file(self, path, **kwargs):
|
82 |
+
return await self.fs._rm_file(self._join(path), **kwargs)
|
83 |
+
|
84 |
+
def rm_file(self, path, **kwargs):
|
85 |
+
return self.fs.rm_file(self._join(path), **kwargs)
|
86 |
+
|
87 |
+
async def _rm(self, path, *args, **kwargs):
|
88 |
+
return await self.fs._rm(self._join(path), *args, **kwargs)
|
89 |
+
|
90 |
+
def rm(self, path, *args, **kwargs):
|
91 |
+
return self.fs.rm(self._join(path), *args, **kwargs)
|
92 |
+
|
93 |
+
async def _cp_file(self, path1, path2, **kwargs):
|
94 |
+
return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
|
95 |
+
|
96 |
+
def cp_file(self, path1, path2, **kwargs):
|
97 |
+
return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
|
98 |
+
|
99 |
+
async def _copy(
|
100 |
+
self,
|
101 |
+
path1,
|
102 |
+
path2,
|
103 |
+
*args,
|
104 |
+
**kwargs,
|
105 |
+
):
|
106 |
+
return await self.fs._copy(
|
107 |
+
self._join(path1),
|
108 |
+
self._join(path2),
|
109 |
+
*args,
|
110 |
+
**kwargs,
|
111 |
+
)
|
112 |
+
|
113 |
+
def copy(self, path1, path2, *args, **kwargs):
|
114 |
+
return self.fs.copy(
|
115 |
+
self._join(path1),
|
116 |
+
self._join(path2),
|
117 |
+
*args,
|
118 |
+
**kwargs,
|
119 |
+
)
|
120 |
+
|
121 |
+
async def _pipe(self, path, *args, **kwargs):
|
122 |
+
return await self.fs._pipe(self._join(path), *args, **kwargs)
|
123 |
+
|
124 |
+
def pipe(self, path, *args, **kwargs):
|
125 |
+
return self.fs.pipe(self._join(path), *args, **kwargs)
|
126 |
+
|
127 |
+
async def _cat_file(self, path, *args, **kwargs):
|
128 |
+
return await self.fs._cat_file(self._join(path), *args, **kwargs)
|
129 |
+
|
130 |
+
def cat_file(self, path, *args, **kwargs):
|
131 |
+
return self.fs.cat_file(self._join(path), *args, **kwargs)
|
132 |
+
|
133 |
+
async def _cat(self, path, *args, **kwargs):
|
134 |
+
ret = await self.fs._cat(
|
135 |
+
self._join(path),
|
136 |
+
*args,
|
137 |
+
**kwargs,
|
138 |
+
)
|
139 |
+
|
140 |
+
if isinstance(ret, dict):
|
141 |
+
return {self._relpath(key): value for key, value in ret.items()}
|
142 |
+
|
143 |
+
return ret
|
144 |
+
|
145 |
+
def cat(self, path, *args, **kwargs):
|
146 |
+
ret = self.fs.cat(
|
147 |
+
self._join(path),
|
148 |
+
*args,
|
149 |
+
**kwargs,
|
150 |
+
)
|
151 |
+
|
152 |
+
if isinstance(ret, dict):
|
153 |
+
return {self._relpath(key): value for key, value in ret.items()}
|
154 |
+
|
155 |
+
return ret
|
156 |
+
|
157 |
+
async def _put_file(self, lpath, rpath, **kwargs):
|
158 |
+
return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
|
159 |
+
|
160 |
+
def put_file(self, lpath, rpath, **kwargs):
|
161 |
+
return self.fs.put_file(lpath, self._join(rpath), **kwargs)
|
162 |
+
|
163 |
+
async def _put(
|
164 |
+
self,
|
165 |
+
lpath,
|
166 |
+
rpath,
|
167 |
+
*args,
|
168 |
+
**kwargs,
|
169 |
+
):
|
170 |
+
return await self.fs._put(
|
171 |
+
lpath,
|
172 |
+
self._join(rpath),
|
173 |
+
*args,
|
174 |
+
**kwargs,
|
175 |
+
)
|
176 |
+
|
177 |
+
def put(self, lpath, rpath, *args, **kwargs):
|
178 |
+
return self.fs.put(
|
179 |
+
lpath,
|
180 |
+
self._join(rpath),
|
181 |
+
*args,
|
182 |
+
**kwargs,
|
183 |
+
)
|
184 |
+
|
185 |
+
async def _get_file(self, rpath, lpath, **kwargs):
|
186 |
+
return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
|
187 |
+
|
188 |
+
def get_file(self, rpath, lpath, **kwargs):
|
189 |
+
return self.fs.get_file(self._join(rpath), lpath, **kwargs)
|
190 |
+
|
191 |
+
async def _get(self, rpath, *args, **kwargs):
|
192 |
+
return await self.fs._get(self._join(rpath), *args, **kwargs)
|
193 |
+
|
194 |
+
def get(self, rpath, *args, **kwargs):
|
195 |
+
return self.fs.get(self._join(rpath), *args, **kwargs)
|
196 |
+
|
197 |
+
async def _isfile(self, path):
|
198 |
+
return await self.fs._isfile(self._join(path))
|
199 |
+
|
200 |
+
def isfile(self, path):
|
201 |
+
return self.fs.isfile(self._join(path))
|
202 |
+
|
203 |
+
async def _isdir(self, path):
|
204 |
+
return await self.fs._isdir(self._join(path))
|
205 |
+
|
206 |
+
def isdir(self, path):
|
207 |
+
return self.fs.isdir(self._join(path))
|
208 |
+
|
209 |
+
async def _size(self, path):
|
210 |
+
return await self.fs._size(self._join(path))
|
211 |
+
|
212 |
+
def size(self, path):
|
213 |
+
return self.fs.size(self._join(path))
|
214 |
+
|
215 |
+
async def _exists(self, path):
|
216 |
+
return await self.fs._exists(self._join(path))
|
217 |
+
|
218 |
+
def exists(self, path):
|
219 |
+
return self.fs.exists(self._join(path))
|
220 |
+
|
221 |
+
async def _info(self, path, **kwargs):
|
222 |
+
return await self.fs._info(self._join(path), **kwargs)
|
223 |
+
|
224 |
+
def info(self, path, **kwargs):
|
225 |
+
return self.fs.info(self._join(path), **kwargs)
|
226 |
+
|
227 |
+
async def _ls(self, path, detail=True, **kwargs):
|
228 |
+
ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
|
229 |
+
if detail:
|
230 |
+
out = []
|
231 |
+
for entry in ret:
|
232 |
+
entry = entry.copy()
|
233 |
+
entry["name"] = self._relpath(entry["name"])
|
234 |
+
out.append(entry)
|
235 |
+
return out
|
236 |
+
|
237 |
+
return self._relpath(ret)
|
238 |
+
|
239 |
+
def ls(self, path, detail=True, **kwargs):
|
240 |
+
ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
|
241 |
+
if detail:
|
242 |
+
out = []
|
243 |
+
for entry in ret:
|
244 |
+
entry = entry.copy()
|
245 |
+
entry["name"] = self._relpath(entry["name"])
|
246 |
+
out.append(entry)
|
247 |
+
return out
|
248 |
+
|
249 |
+
return self._relpath(ret)
|
250 |
+
|
251 |
+
async def _walk(self, path, *args, **kwargs):
|
252 |
+
async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
|
253 |
+
yield self._relpath(root), dirs, files
|
254 |
+
|
255 |
+
def walk(self, path, *args, **kwargs):
|
256 |
+
for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
|
257 |
+
yield self._relpath(root), dirs, files
|
258 |
+
|
259 |
+
async def _glob(self, path, **kwargs):
|
260 |
+
detail = kwargs.get("detail", False)
|
261 |
+
ret = await self.fs._glob(self._join(path), **kwargs)
|
262 |
+
if detail:
|
263 |
+
return {self._relpath(path): info for path, info in ret.items()}
|
264 |
+
return self._relpath(ret)
|
265 |
+
|
266 |
+
def glob(self, path, **kwargs):
|
267 |
+
detail = kwargs.get("detail", False)
|
268 |
+
ret = self.fs.glob(self._join(path), **kwargs)
|
269 |
+
if detail:
|
270 |
+
return {self._relpath(path): info for path, info in ret.items()}
|
271 |
+
return self._relpath(ret)
|
272 |
+
|
273 |
+
async def _du(self, path, *args, **kwargs):
|
274 |
+
total = kwargs.get("total", True)
|
275 |
+
ret = await self.fs._du(self._join(path), *args, **kwargs)
|
276 |
+
if total:
|
277 |
+
return ret
|
278 |
+
|
279 |
+
return {self._relpath(path): size for path, size in ret.items()}
|
280 |
+
|
281 |
+
def du(self, path, *args, **kwargs):
|
282 |
+
total = kwargs.get("total", True)
|
283 |
+
ret = self.fs.du(self._join(path), *args, **kwargs)
|
284 |
+
if total:
|
285 |
+
return ret
|
286 |
+
|
287 |
+
return {self._relpath(path): size for path, size in ret.items()}
|
288 |
+
|
289 |
+
async def _find(self, path, *args, **kwargs):
|
290 |
+
detail = kwargs.get("detail", False)
|
291 |
+
ret = await self.fs._find(self._join(path), *args, **kwargs)
|
292 |
+
if detail:
|
293 |
+
return {self._relpath(path): info for path, info in ret.items()}
|
294 |
+
return self._relpath(ret)
|
295 |
+
|
296 |
+
def find(self, path, *args, **kwargs):
|
297 |
+
detail = kwargs.get("detail", False)
|
298 |
+
ret = self.fs.find(self._join(path), *args, **kwargs)
|
299 |
+
if detail:
|
300 |
+
return {self._relpath(path): info for path, info in ret.items()}
|
301 |
+
return self._relpath(ret)
|
302 |
+
|
303 |
+
async def _expand_path(self, path, *args, **kwargs):
|
304 |
+
return self._relpath(
|
305 |
+
await self.fs._expand_path(self._join(path), *args, **kwargs)
|
306 |
+
)
|
307 |
+
|
308 |
+
def expand_path(self, path, *args, **kwargs):
|
309 |
+
return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
|
310 |
+
|
311 |
+
async def _mkdir(self, path, *args, **kwargs):
|
312 |
+
return await self.fs._mkdir(self._join(path), *args, **kwargs)
|
313 |
+
|
314 |
+
def mkdir(self, path, *args, **kwargs):
|
315 |
+
return self.fs.mkdir(self._join(path), *args, **kwargs)
|
316 |
+
|
317 |
+
async def _makedirs(self, path, *args, **kwargs):
|
318 |
+
return await self.fs._makedirs(self._join(path), *args, **kwargs)
|
319 |
+
|
320 |
+
def makedirs(self, path, *args, **kwargs):
|
321 |
+
return self.fs.makedirs(self._join(path), *args, **kwargs)
|
322 |
+
|
323 |
+
def rmdir(self, path):
|
324 |
+
return self.fs.rmdir(self._join(path))
|
325 |
+
|
326 |
+
def mv_file(self, path1, path2, **kwargs):
|
327 |
+
return self.fs.mv_file(
|
328 |
+
self._join(path1),
|
329 |
+
self._join(path2),
|
330 |
+
**kwargs,
|
331 |
+
)
|
332 |
+
|
333 |
+
def touch(self, path, **kwargs):
|
334 |
+
return self.fs.touch(self._join(path), **kwargs)
|
335 |
+
|
336 |
+
def created(self, path):
|
337 |
+
return self.fs.created(self._join(path))
|
338 |
+
|
339 |
+
def modified(self, path):
|
340 |
+
return self.fs.modified(self._join(path))
|
341 |
+
|
342 |
+
def sign(self, path, *args, **kwargs):
|
343 |
+
return self.fs.sign(self._join(path), *args, **kwargs)
|
344 |
+
|
345 |
+
def __repr__(self):
|
346 |
+
return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
|
347 |
+
|
348 |
+
def open(
|
349 |
+
self,
|
350 |
+
path,
|
351 |
+
*args,
|
352 |
+
**kwargs,
|
353 |
+
):
|
354 |
+
return self.fs.open(
|
355 |
+
self._join(path),
|
356 |
+
*args,
|
357 |
+
**kwargs,
|
358 |
+
)
|
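Each wrapper above does the same two things: _join prefixes the incoming path with self.path before delegating to the wrapped filesystem, and _relpath strips that prefix from whatever comes back. A short sketch of the effect, using the local filesystem as the target; the directory is a placeholder:

import fsspec

# everything this instance touches lives under /tmp/project (placeholder path)
dirfs = fsspec.filesystem("dir", path="/tmp/project", target_protocol="file")

dirfs.makedirs("data", exist_ok=True)            # creates /tmp/project/data
with dirfs.open("data/notes.txt", "wb") as f:    # writes /tmp/project/data/notes.txt
    f.write(b"scoped to the prefix")
print(dirfs.ls("data", detail=False))            # names come back relative to the prefix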
lib/python3.11/site-packages/fsspec/implementations/ftp.py
ADDED
@@ -0,0 +1,380 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import uuid
|
4 |
+
import warnings
|
5 |
+
from ftplib import FTP, Error, error_perm
|
6 |
+
from typing import Any
|
7 |
+
|
8 |
+
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
9 |
+
from ..utils import infer_storage_options, isfilelike
|
10 |
+
|
11 |
+
|
12 |
+
class FTPFileSystem(AbstractFileSystem):
|
13 |
+
"""A filesystem over classic FTP"""
|
14 |
+
|
15 |
+
root_marker = "/"
|
16 |
+
cachable = False
|
17 |
+
protocol = "ftp"
|
18 |
+
|
19 |
+
def __init__(
|
20 |
+
self,
|
21 |
+
host,
|
22 |
+
port=21,
|
23 |
+
username=None,
|
24 |
+
password=None,
|
25 |
+
acct=None,
|
26 |
+
block_size=None,
|
27 |
+
tempdir=None,
|
28 |
+
timeout=30,
|
29 |
+
encoding="utf-8",
|
30 |
+
**kwargs,
|
31 |
+
):
|
32 |
+
"""
|
33 |
+
You can use _get_kwargs_from_urls to get some kwargs from
|
34 |
+
a reasonable FTP url.
|
35 |
+
|
36 |
+
Authentication will be anonymous if username/password are not
|
37 |
+
given.
|
38 |
+
|
39 |
+
Parameters
|
40 |
+
----------
|
41 |
+
host: str
|
42 |
+
The remote server name/ip to connect to
|
43 |
+
port: int
|
44 |
+
Port to connect with
|
45 |
+
username: str or None
|
46 |
+
If authenticating, the user's identifier
|
47 |
+
password: str of None
|
48 |
+
User's password on the server, if using
|
49 |
+
acct: str or None
|
50 |
+
Some servers also need an "account" string for auth
|
51 |
+
block_size: int or None
|
52 |
+
If given, the read-ahead or write buffer size.
|
53 |
+
tempdir: str
|
54 |
+
Directory on remote to put temporary files when in a transaction
|
55 |
+
timeout: int
|
56 |
+
Timeout of the ftp connection in seconds
|
57 |
+
encoding: str
|
58 |
+
Encoding to use for directories and filenames in FTP connection
|
59 |
+
"""
|
60 |
+
super().__init__(**kwargs)
|
61 |
+
self.host = host
|
62 |
+
self.port = port
|
63 |
+
self.tempdir = tempdir or "/tmp"
|
64 |
+
self.cred = username, password, acct
|
65 |
+
self.timeout = timeout
|
66 |
+
self.encoding = encoding
|
67 |
+
if block_size is not None:
|
68 |
+
self.blocksize = block_size
|
69 |
+
else:
|
70 |
+
self.blocksize = 2**16
|
71 |
+
self._connect()
|
72 |
+
|
73 |
+
def _connect(self):
|
74 |
+
if sys.version_info >= (3, 9):
|
75 |
+
self.ftp = FTP(timeout=self.timeout, encoding=self.encoding)
|
76 |
+
elif self.encoding:
|
77 |
+
warnings.warn("`encoding` not supported for python<3.9, ignoring")
|
78 |
+
self.ftp = FTP(timeout=self.timeout)
|
79 |
+
else:
|
80 |
+
self.ftp = FTP(timeout=self.timeout)
|
81 |
+
self.ftp.connect(self.host, self.port)
|
82 |
+
self.ftp.login(*self.cred)
|
83 |
+
|
84 |
+
@classmethod
|
85 |
+
def _strip_protocol(cls, path):
|
86 |
+
return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
|
87 |
+
|
88 |
+
@staticmethod
|
89 |
+
def _get_kwargs_from_urls(urlpath):
|
90 |
+
out = infer_storage_options(urlpath)
|
91 |
+
out.pop("path", None)
|
92 |
+
out.pop("protocol", None)
|
93 |
+
return out
|
94 |
+
|
95 |
+
def ls(self, path, detail=True, **kwargs):
|
96 |
+
path = self._strip_protocol(path)
|
97 |
+
out = []
|
98 |
+
if path not in self.dircache:
|
99 |
+
try:
|
100 |
+
try:
|
101 |
+
out = [
|
102 |
+
(fn, details)
|
103 |
+
for (fn, details) in self.ftp.mlsd(path)
|
104 |
+
if fn not in [".", ".."]
|
105 |
+
and details["type"] not in ["pdir", "cdir"]
|
106 |
+
]
|
107 |
+
except error_perm:
|
108 |
+
out = _mlsd2(self.ftp, path) # Not platform independent
|
109 |
+
for fn, details in out:
|
110 |
+
if path == "/":
|
111 |
+
path = "" # just for forming the names, below
|
112 |
+
details["name"] = "/".join([path, fn.lstrip("/")])
|
113 |
+
if details["type"] == "file":
|
114 |
+
details["size"] = int(details["size"])
|
115 |
+
else:
|
116 |
+
details["size"] = 0
|
117 |
+
if details["type"] == "dir":
|
118 |
+
details["type"] = "directory"
|
119 |
+
self.dircache[path] = out
|
120 |
+
except Error:
|
121 |
+
try:
|
122 |
+
info = self.info(path)
|
123 |
+
if info["type"] == "file":
|
124 |
+
out = [(path, info)]
|
125 |
+
except (Error, IndexError):
|
126 |
+
raise FileNotFoundError(path)
|
127 |
+
files = self.dircache.get(path, out)
|
128 |
+
if not detail:
|
129 |
+
return sorted([fn for fn, details in files])
|
130 |
+
return [details for fn, details in files]
|
131 |
+
|
132 |
+
def info(self, path, **kwargs):
|
133 |
+
# implement with direct method
|
134 |
+
path = self._strip_protocol(path)
|
135 |
+
if path == "/":
|
136 |
+
# special case, since this dir has no real entry
|
137 |
+
return {"name": "/", "size": 0, "type": "directory"}
|
138 |
+
files = self.ls(self._parent(path).lstrip("/"), True)
|
139 |
+
try:
|
140 |
+
out = [f for f in files if f["name"] == path][0]
|
141 |
+
except IndexError:
|
142 |
+
raise FileNotFoundError(path)
|
143 |
+
return out
|
144 |
+
|
145 |
+
def get_file(self, rpath, lpath, **kwargs):
|
146 |
+
if self.isdir(rpath):
|
147 |
+
if not os.path.exists(lpath):
|
148 |
+
os.mkdir(lpath)
|
149 |
+
return
|
150 |
+
if isfilelike(lpath):
|
151 |
+
outfile = lpath
|
152 |
+
else:
|
153 |
+
outfile = open(lpath, "wb")
|
154 |
+
|
155 |
+
def cb(x):
|
156 |
+
outfile.write(x)
|
157 |
+
|
158 |
+
self.ftp.retrbinary(
|
159 |
+
f"RETR {rpath}",
|
160 |
+
blocksize=self.blocksize,
|
161 |
+
callback=cb,
|
162 |
+
)
|
163 |
+
if not isfilelike(lpath):
|
164 |
+
outfile.close()
|
165 |
+
|
166 |
+
def cat_file(self, path, start=None, end=None, **kwargs):
|
167 |
+
if end is not None:
|
168 |
+
return super().cat_file(path, start, end, **kwargs)
|
169 |
+
out = []
|
170 |
+
|
171 |
+
def cb(x):
|
172 |
+
out.append(x)
|
173 |
+
|
174 |
+
self.ftp.retrbinary(
|
175 |
+
f"RETR {path}",
|
176 |
+
blocksize=self.blocksize,
|
177 |
+
rest=start,
|
178 |
+
callback=cb,
|
179 |
+
)
|
180 |
+
return b"".join(out)
|
181 |
+
|
182 |
+
def _open(
|
183 |
+
self,
|
184 |
+
path,
|
185 |
+
mode="rb",
|
186 |
+
block_size=None,
|
187 |
+
cache_options=None,
|
188 |
+
autocommit=True,
|
189 |
+
**kwargs,
|
190 |
+
):
|
191 |
+
path = self._strip_protocol(path)
|
192 |
+
block_size = block_size or self.blocksize
|
193 |
+
return FTPFile(
|
194 |
+
self,
|
195 |
+
path,
|
196 |
+
mode=mode,
|
197 |
+
block_size=block_size,
|
198 |
+
tempdir=self.tempdir,
|
199 |
+
autocommit=autocommit,
|
200 |
+
cache_options=cache_options,
|
201 |
+
)
|
202 |
+
|
203 |
+
def _rm(self, path):
|
204 |
+
path = self._strip_protocol(path)
|
205 |
+
self.ftp.delete(path)
|
206 |
+
self.invalidate_cache(self._parent(path))
|
207 |
+
|
208 |
+
def rm(self, path, recursive=False, maxdepth=None):
|
209 |
+
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
210 |
+
for p in reversed(paths):
|
211 |
+
if self.isfile(p):
|
212 |
+
self.rm_file(p)
|
213 |
+
else:
|
214 |
+
self.rmdir(p)
|
215 |
+
|
216 |
+
def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
|
217 |
+
path = self._strip_protocol(path)
|
218 |
+
parent = self._parent(path)
|
219 |
+
if parent != self.root_marker and not self.exists(parent) and create_parents:
|
220 |
+
self.mkdir(parent, create_parents=create_parents)
|
221 |
+
|
222 |
+
self.ftp.mkd(path)
|
223 |
+
self.invalidate_cache(self._parent(path))
|
224 |
+
|
225 |
+
def makedirs(self, path: str, exist_ok: bool = False) -> None:
|
226 |
+
path = self._strip_protocol(path)
|
227 |
+
if self.exists(path):
|
228 |
+
# NB: "/" does not "exist" as it has no directory entry
|
229 |
+
if not exist_ok:
|
230 |
+
raise FileExistsError(f"{path} exists without `exist_ok`")
|
231 |
+
# exists_ok=True -> no-op
|
232 |
+
else:
|
233 |
+
self.mkdir(path, create_parents=True)
|
234 |
+
|
235 |
+
def rmdir(self, path):
|
236 |
+
path = self._strip_protocol(path)
|
237 |
+
self.ftp.rmd(path)
|
238 |
+
self.invalidate_cache(self._parent(path))
|
239 |
+
|
240 |
+
def mv(self, path1, path2, **kwargs):
|
241 |
+
path1 = self._strip_protocol(path1)
|
242 |
+
path2 = self._strip_protocol(path2)
|
243 |
+
self.ftp.rename(path1, path2)
|
244 |
+
self.invalidate_cache(self._parent(path1))
|
245 |
+
self.invalidate_cache(self._parent(path2))
|
246 |
+
|
247 |
+
def __del__(self):
|
248 |
+
self.ftp.close()
|
249 |
+
|
250 |
+
def invalidate_cache(self, path=None):
|
251 |
+
if path is None:
|
252 |
+
self.dircache.clear()
|
253 |
+
else:
|
254 |
+
self.dircache.pop(path, None)
|
255 |
+
super().invalidate_cache(path)
|
256 |
+
|
257 |
+
|
258 |
+
class TransferDone(Exception):
|
259 |
+
"""Internal exception to break out of transfer"""
|
260 |
+
|
261 |
+
pass
|
262 |
+
|
263 |
+
|
264 |
+
class FTPFile(AbstractBufferedFile):
|
265 |
+
"""Interact with a remote FTP file with read/write buffering"""
|
266 |
+
|
267 |
+
def __init__(
|
268 |
+
self,
|
269 |
+
fs,
|
270 |
+
path,
|
271 |
+
mode="rb",
|
272 |
+
block_size="default",
|
273 |
+
autocommit=True,
|
274 |
+
cache_type="readahead",
|
275 |
+
cache_options=None,
|
276 |
+
**kwargs,
|
277 |
+
):
|
278 |
+
super().__init__(
|
279 |
+
fs,
|
280 |
+
path,
|
281 |
+
mode=mode,
|
282 |
+
block_size=block_size,
|
283 |
+
autocommit=autocommit,
|
284 |
+
cache_type=cache_type,
|
285 |
+
cache_options=cache_options,
|
286 |
+
**kwargs,
|
287 |
+
)
|
288 |
+
if not autocommit:
|
289 |
+
self.target = self.path
|
290 |
+
self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
|
291 |
+
|
292 |
+
def commit(self):
|
293 |
+
self.fs.mv(self.path, self.target)
|
294 |
+
|
295 |
+
def discard(self):
|
296 |
+
self.fs.rm(self.path)
|
297 |
+
|
298 |
+
def _fetch_range(self, start, end):
|
299 |
+
"""Get bytes between given byte limits
|
300 |
+
|
301 |
+
Implemented by raising an exception in the fetch callback when the
|
302 |
+
number of bytes received reaches the requested amount.
|
303 |
+
|
304 |
+
Will fail if the server does not respect the REST command on
|
305 |
+
retrieve requests.
|
306 |
+
"""
|
307 |
+
out = []
|
308 |
+
total = [0]
|
309 |
+
|
310 |
+
def callback(x):
|
311 |
+
total[0] += len(x)
|
312 |
+
if total[0] > end - start:
|
313 |
+
out.append(x[: (end - start) - total[0]])
|
314 |
+
if end < self.size:
|
315 |
+
raise TransferDone
|
316 |
+
else:
|
317 |
+
out.append(x)
|
318 |
+
|
319 |
+
if total[0] == end - start and end < self.size:
|
320 |
+
raise TransferDone
|
321 |
+
|
322 |
+
try:
|
323 |
+
self.fs.ftp.retrbinary(
|
324 |
+
f"RETR {self.path}",
|
325 |
+
blocksize=self.blocksize,
|
326 |
+
rest=start,
|
327 |
+
callback=callback,
|
328 |
+
)
|
329 |
+
except TransferDone:
|
330 |
+
try:
|
331 |
+
# stop transfer, we got enough bytes for this block
|
332 |
+
self.fs.ftp.abort()
|
333 |
+
self.fs.ftp.getmultiline()
|
334 |
+
except Error:
|
335 |
+
self.fs._connect()
|
336 |
+
|
337 |
+
return b"".join(out)
|
338 |
+
|
339 |
+
def _upload_chunk(self, final=False):
|
340 |
+
self.buffer.seek(0)
|
341 |
+
self.fs.ftp.storbinary(
|
342 |
+
f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
|
343 |
+
)
|
344 |
+
return True
|
345 |
+
|
346 |
+
|
347 |
+
def _mlsd2(ftp, path="."):
|
348 |
+
"""
|
349 |
+
Fall back to using `dir` instead of `mlsd` if not supported.
|
350 |
+
|
351 |
+
This parses a Linux style `ls -l` response to `dir`, but the response may
|
352 |
+
be platform dependent.
|
353 |
+
|
354 |
+
Parameters
|
355 |
+
----------
|
356 |
+
ftp: ftplib.FTP
|
357 |
+
path: str
|
358 |
+
Expects to be given path, but defaults to ".".
|
359 |
+
"""
|
360 |
+
lines = []
|
361 |
+
minfo = []
|
362 |
+
ftp.dir(path, lines.append)
|
363 |
+
for line in lines:
|
364 |
+
line = line.split()
|
365 |
+
this = (
|
366 |
+
line[-1],
|
367 |
+
{
|
368 |
+
"modify": " ".join(line[5:8]),
|
369 |
+
"unix.owner": line[2],
|
370 |
+
"unix.group": line[3],
|
371 |
+
"unix.mode": line[0],
|
372 |
+
"size": line[4],
|
373 |
+
},
|
374 |
+
)
|
375 |
+
if "d" == this[1]["unix.mode"][0]:
|
376 |
+
this[1]["type"] = "dir"
|
377 |
+
else:
|
378 |
+
this[1]["type"] = "file"
|
379 |
+
minfo.append(this)
|
380 |
+
return minfo
|
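_mlsd2 is the fallback used when the server rejects MLSD: it issues a plain dir (LIST) and parses the Unix-style "ls -l" rows into the same (name, details) tuples, so ls works either way. A minimal read sketch for the filesystem as a whole; host and credentials are placeholders, and login is anonymous when they are omitted:

import fsspec

fs = fsspec.filesystem(
    "ftp",
    host="ftp.example.com",   # placeholder server
    username="demo",          # omit username/password for anonymous access
    password="secret",
)

print(fs.ls("/pub", detail=False))
with fs.open("/pub/readme.txt", "rb") as f:   # ranged reads issue RETR with a REST offset
    print(f.read(200))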
lib/python3.11/site-packages/fsspec/implementations/git.py
ADDED
@@ -0,0 +1,127 @@
import os

import pygit2

from fsspec.spec import AbstractFileSystem

from .memory import MemoryFile


class GitFileSystem(AbstractFileSystem):
    """Browse the files of a local git repo at any hash/tag/branch

    (experimental backend)
    """

    root_marker = ""
    cachable = True

    def __init__(self, path=None, fo=None, ref=None, **kwargs):
        """

        Parameters
        ----------
        path: str (optional)
            Local location of the repo (uses current directory if not given).
            May be deprecated in favour of ``fo``. When used with a higher
            level function such as fsspec.open(), may be of the form
            "git://[path-to-repo[:]][ref@]path/to/file" (but the actual
            file path should not contain "@" or ":").
        fo: str (optional)
            Same as ``path``, but passed as part of a chained URL. This one
            takes precedence if both are given.
        ref: str (optional)
            Reference to work with, could be a hash, tag or branch name. Defaults
            to current working tree. Note that ``ls`` and ``open`` also take hash,
            so this becomes the default for those operations
        kwargs
        """
        super().__init__(**kwargs)
        self.repo = pygit2.Repository(fo or path or os.getcwd())
        self.ref = ref or "master"

    @classmethod
    def _strip_protocol(cls, path):
        path = super()._strip_protocol(path).lstrip("/")
        if ":" in path:
            path = path.split(":", 1)[1]
        if "@" in path:
            path = path.split("@", 1)[1]
        return path.lstrip("/")

    def _path_to_object(self, path, ref):
        comm, ref = self.repo.resolve_refish(ref or self.ref)
        parts = path.split("/")
        tree = comm.tree
        for part in parts:
            if part and isinstance(tree, pygit2.Tree):
                tree = tree[part]
        return tree

    @staticmethod
    def _get_kwargs_from_urls(path):
        if path.startswith("git://"):
            path = path[6:]
        out = {}
        if ":" in path:
            out["path"], path = path.split(":", 1)
        if "@" in path:
            out["ref"], path = path.split("@", 1)
        return out

    def ls(self, path, detail=True, ref=None, **kwargs):
        path = self._strip_protocol(path)
        tree = self._path_to_object(path, ref)
        if isinstance(tree, pygit2.Tree):
            out = []
            for obj in tree:
                if isinstance(obj, pygit2.Tree):
                    out.append(
                        {
                            "type": "directory",
                            "name": "/".join([path, obj.name]).lstrip("/"),
                            "hex": obj.hex,
                            "mode": f"{obj.filemode:o}",
                            "size": 0,
                        }
                    )
                else:
                    out.append(
                        {
                            "type": "file",
                            "name": "/".join([path, obj.name]).lstrip("/"),
                            "hex": obj.hex,
                            "mode": f"{obj.filemode:o}",
                            "size": obj.size,
                        }
                    )
        else:
            obj = tree
            out = [
                {
                    "type": "file",
                    "name": obj.name,
                    "hex": obj.hex,
                    "mode": f"{obj.filemode:o}",
                    "size": obj.size,
                }
            ]
        if detail:
            return out
        return [o["name"] for o in out]

    def ukey(self, path, ref=None):
        return self.info(path, ref=ref)["hex"]

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        ref=None,
        **kwargs,
    ):
        obj = self._path_to_object(path, ref or self.ref)
        return MemoryFile(data=obj.data)
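URLs for this backend can carry the repo location and ref alongside the file path ("git://[path-to-repo[:]][ref@]path/to/file", as the docstring notes); _get_kwargs_from_urls peels the repo and ref off, and _strip_protocol drops them from the path proper. A small sketch, with the repo path and tag as placeholders:

import fsspec

# read a file as it existed at tag v1.2.3 of a local clone (repo path and tag are placeholders)
with fsspec.open("git:///home/me/project:v1.2.3@src/main.py", "rb") as f:
    print(f.read().decode()[:200])

# or build the filesystem directly and browse the tree
fs = fsspec.filesystem("git", path="/home/me/project", ref="v1.2.3")
print(fs.ls("src", detail=False))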
lib/python3.11/site-packages/fsspec/implementations/github.py
ADDED
@@ -0,0 +1,215 @@
import requests

from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .memory import MemoryFile

# TODO: add GIST backend, would be very similar


class GithubFileSystem(AbstractFileSystem):
    """Interface to files in github

    An instance of this class provides the files residing within a remote github
    repository. You may specify a point in the repos history, by SHA, branch
    or tag (default is current master).

    Given that code files tend to be small, and that github does not support
    retrieving partial content, we always fetch whole files.

    When using fsspec.open, allows URIs of the form:

    - "github://path/file", in which case you must specify org, repo and
      may specify sha in the extra args
    - 'github://org:repo@/precip/catalog.yml', where the org and repo are
      part of the URI
    - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included

    ``sha`` can be the full or abbreviated hex of the commit you want to fetch
    from, or a branch or tag name (so long as it doesn't contain special characters
    like "/", "?", which would have to be HTTP-encoded).

    For authorised access, you must provide username and token, which can be made
    at https://github.com/settings/tokens
    """

    url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
    rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
    protocol = "github"

    def __init__(self, org, repo, sha=None, username=None, token=None, **kwargs):
        super().__init__(**kwargs)
        self.org = org
        self.repo = repo
        if (username is None) ^ (token is None):
            raise ValueError("Auth required both username and token")
        self.username = username
        self.token = token
        if sha is None:
            # look up default branch (not necessarily "master")
            u = "https://api.github.com/repos/{org}/{repo}"
            r = requests.get(u.format(org=org, repo=repo), **self.kw)
            r.raise_for_status()
            sha = r.json()["default_branch"]

        self.root = sha
        self.ls("")

    @property
    def kw(self):
        if self.username:
            return {"auth": (self.username, self.token)}
        return {}

    @classmethod
    def repos(cls, org_or_user, is_org=True):
        """List repo names for given org or user

        This may become the top level of the FS

        Parameters
        ----------
        org_or_user: str
            Name of the github org or user to query
        is_org: bool (default True)
            Whether the name is an organisation (True) or user (False)

        Returns
        -------
        List of string
        """
        r = requests.get(
            f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos"
        )
        r.raise_for_status()
        return [repo["name"] for repo in r.json()]

    @property
    def tags(self):
        """Names of tags in the repo"""
        r = requests.get(
            f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
            **self.kw,
        )
        r.raise_for_status()
        return [t["name"] for t in r.json()]

    @property
    def branches(self):
        """Names of branches in the repo"""
        r = requests.get(
            f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
            **self.kw,
        )
        r.raise_for_status()
        return [t["name"] for t in r.json()]

    @property
    def refs(self):
        """Named references, tags and branches"""
        return {"tags": self.tags, "branches": self.branches}

    def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
        """List files at given path

        Parameters
        ----------
        path: str
            Location to list, relative to repo root
        detail: bool
            If True, returns list of dicts, one per file; if False, returns
            list of full filenames only
        sha: str (optional)
            List at the given point in the repo history, branch or tag name or commit
            SHA
        _sha: str (optional)
            List this specific tree object (used internally to descend into trees)
        """
        path = self._strip_protocol(path)
        if path == "":
            _sha = sha or self.root
        if _sha is None:
            parts = path.rstrip("/").split("/")
            so_far = ""
            _sha = sha or self.root
            for part in parts:
                out = self.ls(so_far, True, sha=sha, _sha=_sha)
                so_far += "/" + part if so_far else part
                out = [o for o in out if o["name"] == so_far]
                if not out:
                    raise FileNotFoundError(path)
                out = out[0]
                if out["type"] == "file":
                    if detail:
                        return [out]
                    else:
                        return path
                _sha = out["sha"]
        if path not in self.dircache or sha not in [self.root, None]:
            r = requests.get(
                self.url.format(org=self.org, repo=self.repo, sha=_sha), **self.kw
            )
            if r.status_code == 404:
                raise FileNotFoundError(path)
            r.raise_for_status()
            types = {"blob": "file", "tree": "directory"}
            out = [
                {
                    "name": path + "/" + f["path"] if path else f["path"],
                    "mode": f["mode"],
                    "type": types[f["type"]],
                    "size": f.get("size", 0),
                    "sha": f["sha"],
                }
                for f in r.json()["tree"]
                if f["type"] in types
            ]
            if sha in [self.root, None]:
                self.dircache[path] = out
        else:
            out = self.dircache[path]
        if detail:
            return out
        else:
            return sorted([f["name"] for f in out])

    def invalidate_cache(self, path=None):
        self.dircache.clear()

    @classmethod
    def _strip_protocol(cls, path):
        opts = infer_storage_options(path)
        if "username" not in opts:
            return super()._strip_protocol(path)
        return opts["path"].lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path):
        opts = infer_storage_options(path)
        if "username" not in opts:
            return {}
        out = {"org": opts["username"], "repo": opts["password"]}
        if opts["host"]:
            out["sha"] = opts["host"]
        return out

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        sha=None,
        **kwargs,
    ):
        if mode != "rb":
            raise NotImplementedError
        url = self.rurl.format(
            org=self.org, repo=self.repo, path=path, sha=sha or self.root
        )
        r = requests.get(url, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        return MemoryFile(None, None, r.content)
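
For reference, a minimal usage sketch of the GithubFileSystem added above, following the URI forms documented in its docstring. The org, repo and file path below are placeholders, not values taken from this change.

import fsspec

# Instantiate directly and list the repository root at its default branch
# ("intake"/"filesystem_spec" and "README.md" are hypothetical examples).
fs = fsspec.filesystem("github", org="intake", repo="filesystem_spec")
print(fs.ls(""))

# Or embed org:repo (and optionally a sha/branch/tag) in the URL itself.
with fsspec.open("github://intake:filesystem_spec@/README.md", "rb") as f:
    print(f.read(80))
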
lib/python3.11/site-packages/fsspec/implementations/http.py
ADDED
@@ -0,0 +1,864 @@
import asyncio
import io
import logging
import re
import weakref
from copy import copy
from urllib.parse import urlparse

import aiohttp
import requests
import yarl

from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
from fsspec.callbacks import _DEFAULT_CALLBACK
from fsspec.exceptions import FSTimeoutError
from fsspec.spec import AbstractBufferedFile
from fsspec.utils import (
    DEFAULT_BLOCK_SIZE,
    glob_translate,
    isfilelike,
    nullcontext,
    tokenize,
)

from ..caching import AllBytes

# https://stackoverflow.com/a/15926317/3821154
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
logger = logging.getLogger("fsspec.http")


async def get_client(**kwargs):
    return aiohttp.ClientSession(**kwargs)


class HTTPFileSystem(AsyncFileSystem):
    """
    Simple File-System for fetching data via HTTP(S)

    ``ls()`` is implemented by loading the parent page and doing a regex
    match on the result. If simple_link=True, anything of the form
    "http(s)://server.com/stuff?thing=other"; otherwise only links within
    HTML href tags will be used.
    """

    sep = "/"

    def __init__(
        self,
        simple_links=True,
        block_size=None,
        same_scheme=True,
        size_policy=None,
        cache_type="bytes",
        cache_options=None,
        asynchronous=False,
        loop=None,
        client_kwargs=None,
        get_client=get_client,
        encoded=False,
        **storage_options,
    ):
        """
        NB: if this is called async, you must await set_client

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: True
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        client_kwargs: dict
            Passed to aiohttp.ClientSession, see
            https://docs.aiohttp.org/en/stable/client_reference.html
            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
        get_client: Callable[..., aiohttp.ClientSession]
            A callable which takes keyword arguments and constructs
            an aiohttp.ClientSession. Its state will be managed by
            the HTTPFileSystem class.
        storage_options: key-value
            Any other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
        self.simple_links = simple_links
        self.same_schema = same_scheme
        self.cache_type = cache_type
        self.cache_options = cache_options
        self.client_kwargs = client_kwargs or {}
        self.get_client = get_client
        self.encoded = encoded
        self.kwargs = storage_options
        self._session = None

        # Clean caching-related parameters from `storage_options`
        # before propagating them as `request_options` through `self.kwargs`.
        # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
        # it clearer.
        request_options = copy(storage_options)
        self.use_listings_cache = request_options.pop("use_listings_cache", False)
        request_options.pop("listings_expiry_time", None)
        request_options.pop("max_paths", None)
        request_options.pop("skip_instance_cache", None)
        self.kwargs = request_options

    @property
    def fsid(self):
        return "http"

    def encode_url(self, url):
        return yarl.URL(url, encoded=self.encoded)

    @staticmethod
    def close_session(loop, session):
        if loop is not None and loop.is_running():
            try:
                sync(loop, session.close, timeout=0.1)
                return
            except (TimeoutError, FSTimeoutError):
                pass
        connector = getattr(session, "_connector", None)
        if connector is not None:
            # close after loop is dead
            connector._close()

    async def set_session(self):
        if self._session is None:
            self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
            if not self.asynchronous:
                weakref.finalize(self, self.close_session, self.loop, self._session)
        return self._session

    @classmethod
    def _strip_protocol(cls, path):
        """For HTTP, we always want to keep the full URL"""
        return path

    @classmethod
    def _parent(cls, path):
        # override, since _strip_protocol is different for URLs
        par = super()._parent(path)
        if len(par) > 7:  # "http://..."
            return par
        return ""

    async def _ls_real(self, url, detail=True, **kwargs):
        # ignoring URL-encoded arguments
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(url)
        session = await self.set_session()
        async with session.get(self.encode_url(url), **self.kwargs) as r:
            self._raise_not_found_for_status(r, url)
            text = await r.text()
        if self.simple_links:
            links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
        else:
            links = [u[2] for u in ex.findall(text)]
        out = set()
        parts = urlparse(url)
        for l in links:
            if isinstance(l, tuple):
                l = l[1]
            if l.startswith("/") and len(l) > 1:
                # absolute URL on this server
                l = f"{parts.scheme}://{parts.netloc}{l}"
            if l.startswith("http"):
                if self.same_schema and l.startswith(url.rstrip("/") + "/"):
                    out.add(l)
                elif l.replace("https", "http").startswith(
                    url.replace("https", "http").rstrip("/") + "/"
                ):
                    # allowed to cross http <-> https
                    out.add(l)
            else:
                if l not in ["..", "../"]:
                    # Ignore FTP-like "parent"
                    out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
        if not out and url.endswith("/"):
            out = await self._ls_real(url.rstrip("/"), detail=False)
        if detail:
            return [
                {
                    "name": u,
                    "size": None,
                    "type": "directory" if u.endswith("/") else "file",
                }
                for u in out
            ]
        else:
            return sorted(out)

    async def _ls(self, url, detail=True, **kwargs):
        if self.use_listings_cache and url in self.dircache:
            out = self.dircache[url]
        else:
            out = await self._ls_real(url, detail=detail, **kwargs)
            self.dircache[url] = out
        return out

    ls = sync_wrapper(_ls)

    def _raise_not_found_for_status(self, response, url):
        """
        Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
        """
        if response.status == 404:
            raise FileNotFoundError(url)
        response.raise_for_status()

    async def _cat_file(self, url, start=None, end=None, **kwargs):
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(url)

        if start is not None or end is not None:
            if start == end:
                return b""
            headers = kw.pop("headers", {}).copy()

            headers["Range"] = await self._process_limits(url, start, end)
            kw["headers"] = headers
        session = await self.set_session()
        async with session.get(self.encode_url(url), **kw) as r:
            out = await r.read()
            self._raise_not_found_for_status(r, url)
        return out

    async def _get_file(
        self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
    ):
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(rpath)
        session = await self.set_session()
        async with session.get(self.encode_url(rpath), **kw) as r:
            try:
                size = int(r.headers["content-length"])
            except (ValueError, KeyError):
                size = None

            callback.set_size(size)
            self._raise_not_found_for_status(r, rpath)
            if isfilelike(lpath):
                outfile = lpath
            else:
                outfile = open(lpath, "wb")

            try:
                chunk = True
                while chunk:
                    chunk = await r.content.read(chunk_size)
                    outfile.write(chunk)
                    callback.relative_update(len(chunk))
            finally:
                if not isfilelike(lpath):
                    outfile.close()

    async def _put_file(
        self,
        lpath,
        rpath,
        chunk_size=5 * 2**20,
        callback=_DEFAULT_CALLBACK,
        method="post",
        **kwargs,
    ):
        async def gen_chunks():
            # Support passing arbitrary file-like objects
            # and use them instead of streams.
            if isinstance(lpath, io.IOBase):
                context = nullcontext(lpath)
                use_seek = False  # might not support seeking
            else:
                context = open(lpath, "rb")
                use_seek = True

            with context as f:
                if use_seek:
                    callback.set_size(f.seek(0, 2))
                    f.seek(0)
                else:
                    callback.set_size(getattr(f, "size", None))

                chunk = f.read(chunk_size)
                while chunk:
                    yield chunk
                    callback.relative_update(len(chunk))
                    chunk = f.read(chunk_size)

        kw = self.kwargs.copy()
        kw.update(kwargs)
        session = await self.set_session()

        method = method.lower()
        if method not in ("post", "put"):
            raise ValueError(
                f"method has to be either 'post' or 'put', not: {method!r}"
            )

        meth = getattr(session, method)
        async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
            self._raise_not_found_for_status(resp, rpath)

    async def _exists(self, path, **kwargs):
        kw = self.kwargs.copy()
        kw.update(kwargs)
        try:
            logger.debug(path)
            session = await self.set_session()
            r = await session.get(self.encode_url(path), **kw)
            async with r:
                return r.status < 400
        except (requests.HTTPError, aiohttp.ClientError):
            return False

    async def _isfile(self, path, **kwargs):
        return await self._exists(path, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=None,  # XXX: This differs from the base class.
        cache_type=None,
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """Make a file-like object

        Parameters
        ----------
        path: str
            Full URL with protocol
        mode: string
            must be "rb"
        block_size: int or None
            Bytes to download in one request; use instance value if None. If
            zero, will return a streaming Requests file-like instance.
        kwargs: key-value
            Any other parameters, passed to requests calls
        """
        if mode != "rb":
            raise NotImplementedError
        block_size = block_size if block_size is not None else self.block_size
        kw = self.kwargs.copy()
        kw["asynchronous"] = self.asynchronous
        kw.update(kwargs)
        size = size or self.info(path, **kwargs)["size"]
        session = sync(self.loop, self.set_session)
        if block_size and size:
            return HTTPFile(
                self,
                path,
                session=session,
                block_size=block_size,
                mode=mode,
                size=size,
                cache_type=cache_type or self.cache_type,
                cache_options=cache_options or self.cache_options,
                loop=self.loop,
                **kw,
            )
        else:
            return HTTPStreamFile(
                self,
                path,
                mode=mode,
                loop=self.loop,
                session=session,
                **kw,
            )

    async def open_async(self, path, mode="rb", size=None, **kwargs):
        session = await self.set_session()
        if size is None:
            try:
                size = (await self._info(path, **kwargs))["size"]
            except FileNotFoundError:
                pass
        return AsyncStreamFile(
            self,
            path,
            loop=self.loop,
            session=session,
            size=size,
            **kwargs,
        )

    def ukey(self, url):
        """Unique identifier; assume HTTP files are static, unchanging"""
        return tokenize(url, self.kwargs, self.protocol)

    async def _info(self, url, **kwargs):
        """Get info of URL

        Tries to access location via HEAD, and then GET methods, but does
        not fetch the data.

        It is possible that the server does not supply any size information, in
        which case size will be given as None (and certain operations on the
        corresponding file will not work).
        """
        info = {}
        session = await self.set_session()

        for policy in ["head", "get"]:
            try:
                info.update(
                    await _file_info(
                        self.encode_url(url),
                        size_policy=policy,
                        session=session,
                        **self.kwargs,
                        **kwargs,
                    )
                )
                if info.get("size") is not None:
                    break
            except Exception as exc:
                if policy == "get":
                    # If get failed, then raise a FileNotFoundError
                    raise FileNotFoundError(url) from exc
                logger.debug(str(exc))

        return {"name": url, "size": None, **info, "type": "file"}

    async def _glob(self, path, maxdepth=None, **kwargs):
        """
        Find files by glob-matching.

        This implementation is identical to the one in AbstractFileSystem,
        but "?" is not considered as a character for globbing, because it is
        so common in URLs, often identifying the "query" part.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")
        import re

        ends_with_slash = path.endswith("/")  # _strip_protocol strips trailing slash
        path = self._strip_protocol(path)
        append_slash_to_dirname = ends_with_slash or path.endswith("/**")
        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

        min_idx = min(idx_star, idx_brace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            if await self._exists(path, **kwargs):
                if not detail:
                    return [path]
                else:
                    return {path: await self._info(path, **kwargs)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:min_idx]:
            min_idx = path[:min_idx].rindex("/")
            root = path[: min_idx + 1]
            depth = path[min_idx + 1 :].count("/") + 1
        else:
            root = ""
            depth = path[min_idx + 1 :].count("/") + 1

        if "**" in path:
            if maxdepth is not None:
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                depth = depth - depth_double_stars + maxdepth
            else:
                depth = None

        allpaths = await self._find(
            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
        )

        pattern = glob_translate(path + ("/" if ends_with_slash else ""))
        pattern = re.compile(pattern)

        out = {
            p: info
            for p, info in sorted(allpaths.items())
            if pattern.match(
                (
                    p + "/"
                    if append_slash_to_dirname and info["type"] == "directory"
                    else p
                )
            )
        }

        if detail:
            return out
        else:
            return list(out)

    async def _isdir(self, path):
        # override, since all URLs are (also) files
        try:
            return bool(await self._ls(path))
        except (FileNotFoundError, ValueError):
            return False


class HTTPFile(AbstractBufferedFile):
    """
    A file-like object pointing to a remote HTTP(S) resource

    Supports only reading, with read-ahead of a predetermined block-size.

    In the case that the server does not supply the filesize, only reading of
    the complete file in one go is supported.

    Parameters
    ----------
    url: str
        Full URL of the remote resource, including the protocol
    session: requests.Session or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
        configured for the FileSystem creating this file
    size: None or int
        If given, this is the size of the file in bytes, and we don't attempt
        to call the server to find the value.
    kwargs: all other key-values are passed to requests calls.
    """

    def __init__(
        self,
        fs,
        url,
        session=None,
        block_size=None,
        mode="rb",
        cache_type="bytes",
        cache_options=None,
        size=None,
        loop=None,
        asynchronous=False,
        **kwargs,
    ):
        if mode != "rb":
            raise NotImplementedError("File mode not supported")
        self.asynchronous = asynchronous
        self.url = url
        self.session = session
        self.details = {"name": url, "size": size, "type": "file"}
        super().__init__(
            fs=fs,
            path=url,
            mode=mode,
            block_size=block_size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )
        self.loop = loop

    def read(self, length=-1):
        """Read bytes from file

        Parameters
        ----------
        length: int
            Read up to this many bytes. If negative, read all content to end of
            file. If the server has not supplied the filesize, attempting to
            read only part of the data will raise a ValueError.
        """
        if (
            (length < 0 and self.loc == 0)  # explicit read all
            # but not when the size is known and fits into a block anyways
            and not (self.size is not None and self.size <= self.blocksize)
        ):
            self._fetch_all()
        if self.size is None:
            if length < 0:
                self._fetch_all()
        else:
            length = min(self.size - self.loc, length)
        return super().read(length)

    async def async_fetch_all(self):
        """Read whole file in one shot, without caching

        This is only called when position is still at zero,
        and read() is called without a byte-count.
        """
        logger.debug(f"Fetch all for {self}")
        if not isinstance(self.cache, AllBytes):
            r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
            async with r:
                r.raise_for_status()
                out = await r.read()
                self.cache = AllBytes(
                    size=len(out), fetcher=None, blocksize=None, data=out
                )
                self.size = len(out)

    _fetch_all = sync_wrapper(async_fetch_all)

    def _parse_content_range(self, headers):
        """Parse the Content-Range header"""
        s = headers.get("Content-Range", "")
        m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
        if not m:
            return None, None, None

        if m[1] == "*":
            start = end = None
        else:
            start, end = [int(x) for x in m[1].split("-")]
        total = None if m[2] == "*" else int(m[2])
        return start, end, total

    async def async_fetch_range(self, start, end):
        """Download a block of data

        The expectation is that the server returns only the requested bytes,
        with HTTP code 206. If this is not the case, we first check the headers,
        and then stream the output - if the data size is bigger than we
        requested, an exception is raised.
        """
        logger.debug(f"Fetch range for {self}: {start}-{end}")
        kwargs = self.kwargs.copy()
        headers = kwargs.pop("headers", {}).copy()
        headers["Range"] = f"bytes={start}-{end - 1}"
        logger.debug(f"{self.url} : {headers['Range']}")
        r = await self.session.get(
            self.fs.encode_url(self.url), headers=headers, **kwargs
        )
        async with r:
            if r.status == 416:
                # range request outside file
                return b""
            r.raise_for_status()

            # If the server has handled the range request, it should reply
            # with status 206 (partial content). But we'll guess that a suitable
            # Content-Range header or a Content-Length no more than the
            # requested range also mean we have got the desired range.
            response_is_range = (
                r.status == 206
                or self._parse_content_range(r.headers)[0] == start
                or int(r.headers.get("Content-Length", end + 1)) <= end - start
            )

            if response_is_range:
                # partial content, as expected
                out = await r.read()
            elif start > 0:
                raise ValueError(
                    "The HTTP server doesn't appear to support range requests. "
                    "Only reading this file from the beginning is supported. "
                    "Open with block_size=0 for a streaming file interface."
                )
            else:
                # Response is not a range, but we want the start of the file,
                # so we can read the required amount anyway.
                cl = 0
                out = []
                while True:
                    chunk = await r.content.read(2**20)
                    # data size unknown, let's read until we have enough
                    if chunk:
                        out.append(chunk)
                        cl += len(chunk)
                        if cl > end - start:
                            break
                    else:
                        break
                out = b"".join(out)[: end - start]
        return out

    _fetch_range = sync_wrapper(async_fetch_range)

    def __reduce__(self):
        return (
            reopen,
            (
                self.fs,
                self.url,
                self.mode,
                self.blocksize,
                self.cache.name if self.cache else "none",
                self.size,
            ),
        )


def reopen(fs, url, mode, blocksize, cache_type, size=None):
    return fs.open(
        url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size
    )


magic_check = re.compile("([*[])")


def has_magic(s):
    match = magic_check.search(s)
    return match is not None


class HTTPStreamFile(AbstractBufferedFile):
    def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
        self.asynchronous = kwargs.pop("asynchronous", False)
        self.url = url
        self.loop = loop
        self.session = session
        if mode != "rb":
            raise ValueError
        self.details = {"name": url, "size": None}
        super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)

        async def cor():
            r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
            self.fs._raise_not_found_for_status(r, url)
            return r

        self.r = sync(self.loop, cor)

    def seek(self, loc, whence=0):
        if loc == 0 and whence == 1:
            return
        if loc == self.loc and whence == 0:
            return
        raise ValueError("Cannot seek streaming HTTP file")

    async def _read(self, num=-1):
        out = await self.r.content.read(num)
        self.loc += len(out)
        return out

    read = sync_wrapper(_read)

    async def _close(self):
        self.r.close()

    def close(self):
        asyncio.run_coroutine_threadsafe(self._close(), self.loop)
        super().close()

    def __reduce__(self):
        return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)


class AsyncStreamFile(AbstractAsyncStreamedFile):
    def __init__(
        self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
    ):
        self.url = url
        self.session = session
        self.r = None
        if mode != "rb":
            raise ValueError
        self.details = {"name": url, "size": None}
        self.kwargs = kwargs
        super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
        self.size = size

    async def read(self, num=-1):
        if self.r is None:
            r = await self.session.get(
                self.fs.encode_url(self.url), **self.kwargs
            ).__aenter__()
            self.fs._raise_not_found_for_status(r, self.url)
            self.r = r
        out = await self.r.content.read(num)
        self.loc += len(out)
        return out

    async def close(self):
        if self.r is not None:
            self.r.close()
            self.r = None
        await super().close()


async def get_range(session, url, start, end, file=None, **kwargs):
    # explicit get a range when we know it must be safe
    kwargs = kwargs.copy()
    headers = kwargs.pop("headers", {}).copy()
    headers["Range"] = f"bytes={start}-{end - 1}"
    r = await session.get(url, headers=headers, **kwargs)
    r.raise_for_status()
    async with r:
        out = await r.read()
    if file:
        with open(file, "r+b") as f:
            f.seek(start)
            f.write(out)
    else:
        return out


async def _file_info(url, session, size_policy="head", **kwargs):
    """Call HEAD on the server to get details about the file (size/checksum etc.)

    Default operation is to explicitly allow redirects and use encoding
    'identity' (no compression) to get the true size of the target.
    """
    logger.debug("Retrieve file size for %s", url)
    kwargs = kwargs.copy()
    ar = kwargs.pop("allow_redirects", True)
    head = kwargs.get("headers", {}).copy()
    head["Accept-Encoding"] = "identity"
    kwargs["headers"] = head

    info = {}
    if size_policy == "head":
        r = await session.head(url, allow_redirects=ar, **kwargs)
    elif size_policy == "get":
        r = await session.get(url, allow_redirects=ar, **kwargs)
    else:
        raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
    async with r:
        r.raise_for_status()

        # TODO:
        #  recognise lack of 'Accept-Ranges',
        #  or 'Accept-Ranges': 'none' (not 'bytes')
        #  to mean streaming only, no random access => return None
        if "Content-Length" in r.headers:
            # Some servers may choose to ignore Accept-Encoding and return
            # compressed content, in which case the returned size is unreliable.
            if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
                "identity",
                "",
            ]:
                info["size"] = int(r.headers["Content-Length"])
        elif "Content-Range" in r.headers:
            info["size"] = int(r.headers["Content-Range"].split("/")[1])

        for checksum_field in ["ETag", "Content-MD5", "Digest"]:
            if r.headers.get(checksum_field):
                info[checksum_field] = r.headers[checksum_field]

    return info


async def _file_size(url, session=None, *args, **kwargs):
    if session is None:
        session = await get_client()
    info = await _file_info(url, session=session, *args, **kwargs)
    return info.get("size")


file_size = sync_wrapper(_file_size)
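
A minimal sketch of how the filesystem above is typically used; the URL is a placeholder, not part of this change. With a positive block_size, open() returns the HTTPFile defined above (random access via Range requests); block_size=0 gives the streaming HTTPStreamFile instead.

import fsspec

url = "https://example.com/data.bin"  # placeholder URL

# Random-access reads backed by HTTP Range requests.
with fsspec.open(url, "rb", block_size=2**20) as f:
    header = f.read(1024)

# Streaming interface, e.g. when the server does not support Range.
with fsspec.open(url, "rb", block_size=0) as f:
    first = f.read(1024)
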
lib/python3.11/site-packages/fsspec/implementations/jupyter.py
ADDED
@@ -0,0 +1,124 @@
import base64
import io
import re

import requests

import fsspec


class JupyterFileSystem(fsspec.AbstractFileSystem):
    """View of the files as seen by a Jupyter server (notebook or lab)"""

    protocol = ("jupyter", "jlab")

    def __init__(self, url, tok=None, **kwargs):
        """

        Parameters
        ----------
        url : str
            Base URL of the server, like "http://127.0.0.1:8888". May include
            token in the string, which is given by the process when starting up
        tok : str
            If the token is obtained separately, can be given here
        kwargs
        """
        if "?" in url:
            if tok is None:
                try:
                    tok = re.findall("token=([a-z0-9]+)", url)[0]
                except IndexError as e:
                    raise ValueError("Could not determine token") from e
            url = url.split("?", 1)[0]
        self.url = url.rstrip("/") + "/api/contents"
        self.session = requests.Session()
        if tok:
            self.session.headers["Authorization"] = f"token {tok}"

        super().__init__(**kwargs)

    def ls(self, path, detail=True, **kwargs):
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()

        if out["type"] == "directory":
            out = out["content"]
        else:
            out = [out]
        for o in out:
            o["name"] = o.pop("path")
            o.pop("content")
            if o["type"] == "notebook":
                o["type"] = "file"
        if detail:
            return out
        return [o["name"] for o in out]

    def cat_file(self, path, start=None, end=None, **kwargs):
        path = self._strip_protocol(path)
        r = self.session.get(f"{self.url}/{path}")
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        out = r.json()
        if out["format"] == "text":
            # data should be binary
            b = out["content"].encode()
        else:
            b = base64.b64decode(out["content"])
        return b[start:end]

    def pipe_file(self, path, value, **_):
        path = self._strip_protocol(path)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": len(value),
            "content": base64.b64encode(value).decode(),
            "format": "base64",
            "type": "file",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def mkdir(self, path, create_parents=True, **kwargs):
        path = self._strip_protocol(path)
        if create_parents and "/" in path:
            self.mkdir(path.rsplit("/", 1)[0], True)
        json = {
            "name": path.rsplit("/", 1)[-1],
            "path": path,
            "size": None,
            "content": None,
            "type": "directory",
        }
        self.session.put(f"{self.url}/{path}", json=json)

    def _rm(self, path):
        path = self._strip_protocol(path)
        self.session.delete(f"{self.url}/{path}")

    def _open(self, path, mode="rb", **kwargs):
        path = self._strip_protocol(path)
        if mode == "rb":
            data = self.cat_file(path)
            return io.BytesIO(data)
        else:
            return SimpleFileWriter(self, path, mode="wb")


class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
    def _upload_chunk(self, final=False):
        """Never uploads a chunk until file is done

        Not suitable for large files
        """
        if final is False:
            return False
        self.buffer.seek(0)
        data = self.buffer.read()
        self.fs.pipe_file(self.path, data)
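
A short usage sketch for the Jupyter filesystem above, assuming a running Jupyter server; the server URL and token below are hypothetical placeholders.

import fsspec

# Connect using the URL printed by the Jupyter server at startup.
fs = fsspec.filesystem("jupyter", url="http://127.0.0.1:8888?token=abc123")
print(fs.ls(""))

# Write a small file at the server root, then read it back.
fs.pipe_file("hello.txt", b"hello from fsspec")
print(fs.cat_file("hello.txt"))
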
lib/python3.11/site-packages/fsspec/implementations/libarchive.py
ADDED
@@ -0,0 +1,213 @@
from contextlib import contextmanager
from ctypes import (
    CFUNCTYPE,
    POINTER,
    c_int,
    c_longlong,
    c_void_p,
    cast,
    create_string_buffer,
)

import libarchive
import libarchive.ffi as ffi

from fsspec import open_files
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.implementations.memory import MemoryFile
from fsspec.utils import DEFAULT_BLOCK_SIZE

# Libarchive requires seekable files or memory only for certain archive
# types. However, since we read the directory first to cache the contents
# and also allow random access to any file, the file-like object needs
# to be seekable no matter what.

# Seek call-backs (not provided in the libarchive python wrapper)
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
read_set_seek_callback = ffi.ffi(
    "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
)
new_api = hasattr(ffi, "NO_OPEN_CB")


@contextmanager
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
    """Read an archive from a seekable file-like object.

    The `file` object must support the standard `readinto` and 'seek' methods.
    """
    buf = create_string_buffer(block_size)
    buf_p = cast(buf, c_void_p)

    def read_func(archive_p, context, ptrptr):
        # readinto the buffer, returns number of bytes read
        length = file.readinto(buf)
        # write the address of the buffer into the pointer
        ptrptr = cast(ptrptr, POINTER(c_void_p))
        ptrptr[0] = buf_p
        # tell libarchive how much data was written into the buffer
        return length

    def seek_func(archive_p, context, offset, whence):
        file.seek(offset, whence)
        # tell libarchive the current position
        return file.tell()

    read_cb = ffi.READ_CALLBACK(read_func)
    seek_cb = SEEK_CALLBACK(seek_func)

    if new_api:
        open_cb = ffi.NO_OPEN_CB
        close_cb = ffi.NO_CLOSE_CB
    else:
        open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
        close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)

    with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
        read_set_seek_callback(archive_p, seek_cb)
        ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
        yield libarchive.read.ArchiveRead(archive_p)


class LibArchiveFileSystem(AbstractArchiveFileSystem):
    """Compressed archives as a file-system (read-only)

    Supports the following formats:
    tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar
    Microsoft CAB, 7-Zip, WARC

    See the libarchive documentation for further restrictions.
    https://www.libarchive.org/

    Keeps file object open while instance lives. It only works with seekable
    file-like objects. In case the filesystem does not support this kind of
    file object, it is recommended to cache locally.

    This class is pickleable, but not necessarily thread-safe (depends on the
    platform). See libarchive documentation for details.
    """

    root_marker = ""
    protocol = "libarchive"
    cachable = False

    def __init__(
        self,
        fo="",
        mode="r",
        target_protocol=None,
        target_options=None,
        block_size=DEFAULT_BLOCK_SIZE,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Contains the archive, and must exist. If a str, will fetch file using
            :meth:`~fsspec.open_files`, which must return one file exactly.
        mode: str
            Currently, only 'r' accepted
        target_protocol: str (optional)
            If ``fo`` is a string, this value can be used to override the
            FS protocol inferred from a URL
        target_options: dict (optional)
            Kwargs passed when instantiating the target FS, if ``fo`` is
            a string.
        """
        super().__init__(self, **kwargs)
        if mode != "r":
            raise ValueError("Only read from archive files accepted")
        if isinstance(fo, str):
            files = open_files(fo, protocol=target_protocol, **(target_options or {}))
            if len(files) != 1:
                raise ValueError(
                    f'Path "{fo}" did not resolve to exactly one file: "{files}"'
                )
            fo = files[0]
        self.of = fo
        self.fo = fo.__enter__()  # the whole instance is a context
        self.block_size = block_size
        self.dir_cache = None

    @contextmanager
    def _open_archive(self):
        self.fo.seek(0)
        with custom_reader(self.fo, block_size=self.block_size) as arc:
            yield arc

    @classmethod
    def _strip_protocol(cls, path):
        # file paths are always relative to the archive root
        return super()._strip_protocol(path).lstrip("/")

    def _get_dirs(self):
        fields = {
            "name": "pathname",
            "size": "size",
            "created": "ctime",
            "mode": "mode",
            "uid": "uid",
            "gid": "gid",
            "mtime": "mtime",
        }

        if self.dir_cache is not None:
            return

        self.dir_cache = {}
        list_names = []
        with self._open_archive() as arc:
            for entry in arc:
                if not entry.isdir and not entry.isfile:
                    # Skip symbolic links, fifo entries, etc.
                    continue
                self.dir_cache.update(
                    {
                        dirname: {"name": dirname, "size": 0, "type": "directory"}
                        for dirname in self._all_dirnames(set(entry.name))
                    }
                )
                f = {key: getattr(entry, fields[key]) for key in fields}
                f["type"] = "directory" if entry.isdir else "file"
                list_names.append(entry.name)

                self.dir_cache[f["name"]] = f
        # libarchive does not seem to return an entry for the directories (at least
        # not in all formats), so get the directories names from the files names
        self.dir_cache.update(
            {
                dirname: {"name": dirname, "size": 0, "type": "directory"}
                for dirname in self._all_dirnames(list_names)
            }
        )

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        path = self._strip_protocol(path)
        if mode != "rb":
            raise NotImplementedError

        data = bytes()
        with self._open_archive() as arc:
            for entry in arc:
                if entry.pathname != path:
                    continue

                if entry.size == 0:
                    # empty file, so there are no blocks
                    break

                for block in entry.get_blocks(entry.size):
                    data = block
                    break
                else:
                    raise ValueError
        return MemoryFile(fs=self, path=path, data=data)
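
A brief sketch of browsing an archive with the class above; the archive name and member path are placeholder examples, and the python-libarchive-c package must be installed.

import fsspec

# Open a local archive through the libarchive backend and read one member
# ("backup.7z" and "docs/readme.txt" are hypothetical).
fs = fsspec.filesystem("libarchive", fo="backup.7z")
print(fs.ls(""))
with fs.open("docs/readme.txt") as f:
    print(f.read())
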
lib/python3.11/site-packages/fsspec/implementations/local.py
ADDED
@@ -0,0 +1,414 @@
import datetime
import io
import logging
import os
import os.path as osp
import posixpath
import re
import shutil
import stat
import tempfile

from fsspec import AbstractFileSystem
from fsspec.compression import compr
from fsspec.core import get_compression
from fsspec.utils import isfilelike, stringify_path

logger = logging.getLogger("fsspec.local")


class LocalFileSystem(AbstractFileSystem):
    """Interface to files on local storage

    Parameters
    ----------
    auto_mkdir: bool
        Whether, when opening a file, the directory containing it should
        be created (if it doesn't already exist). This is assumed by pyarrow
        code.
    """

    root_marker = "/"
    protocol = "file", "local"
    local_file = True

    def __init__(self, auto_mkdir=False, **kwargs):
        super().__init__(**kwargs)
        self.auto_mkdir = auto_mkdir

    @property
    def fsid(self):
        return "local"

    def mkdir(self, path, create_parents=True, **kwargs):
        path = self._strip_protocol(path)
        if self.exists(path):
            raise FileExistsError(path)
        if create_parents:
            self.makedirs(path, exist_ok=True)
        else:
            os.mkdir(path, **kwargs)

    def makedirs(self, path, exist_ok=False):
        path = self._strip_protocol(path)
        os.makedirs(path, exist_ok=exist_ok)

    def rmdir(self, path):
        path = self._strip_protocol(path)
        os.rmdir(path)

    def ls(self, path, detail=False, **kwargs):
        path = self._strip_protocol(path)
        if detail:
            with os.scandir(path) as it:
                return [self.info(f) for f in it]
        else:
            return [posixpath.join(path, f) for f in os.listdir(path)]

    def info(self, path, **kwargs):
        if isinstance(path, os.DirEntry):
            # scandir DirEntry
            out = path.stat(follow_symlinks=False)
            link = path.is_symlink()
            if path.is_dir(follow_symlinks=False):
                t = "directory"
            elif path.is_file(follow_symlinks=False):
                t = "file"
            else:
                t = "other"
            path = self._strip_protocol(path.path)
        else:
            # str or path-like
            path = self._strip_protocol(path)
            out = os.stat(path, follow_symlinks=False)
            link = stat.S_ISLNK(out.st_mode)
            if link:
                out = os.stat(path, follow_symlinks=True)
            if stat.S_ISDIR(out.st_mode):
                t = "directory"
            elif stat.S_ISREG(out.st_mode):
                t = "file"
            else:
                t = "other"
        result = {
            "name": path,
            "size": out.st_size,
            "type": t,
            "created": out.st_ctime,
            "islink": link,
        }
        for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
            result[field] = getattr(out, f"st_{field}")
        if result["islink"]:
            result["destination"] = os.readlink(path)
            try:
                out2 = os.stat(path, follow_symlinks=True)
                result["size"] = out2.st_size
            except OSError:
                result["size"] = 0
        return result

    def lexists(self, path, **kwargs):
        return osp.lexists(path)

    def cp_file(self, path1, path2, **kwargs):
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        if self.auto_mkdir:
            self.makedirs(self._parent(path2), exist_ok=True)
        if self.isfile(path1):
            shutil.copyfile(path1, path2)
        elif self.isdir(path1):
            self.mkdirs(path2, exist_ok=True)
        else:
            raise FileNotFoundError(path1)

    def get_file(self, path1, path2, callback=None, **kwargs):
        if isfilelike(path2):
            with open(path1, "rb") as f:
                shutil.copyfileobj(f, path2)
        else:
            return self.cp_file(path1, path2, **kwargs)

    def put_file(self, path1, path2, callback=None, **kwargs):
        return self.cp_file(path1, path2, **kwargs)

    def mv_file(self, path1, path2, **kwargs):
        path1 = self._strip_protocol(path1).rstrip("/")
        path2 = self._strip_protocol(path2).rstrip("/")
        shutil.move(path1, path2)

    def link(self, src, dst, **kwargs):
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.link(src, dst, **kwargs)

    def symlink(self, src, dst, **kwargs):
        src = self._strip_protocol(src)
        dst = self._strip_protocol(dst)
        os.symlink(src, dst, **kwargs)

    def islink(self, path) -> bool:
        return os.path.islink(self._strip_protocol(path))

    def rm_file(self, path):
        os.remove(self._strip_protocol(path))

    def rm(self, path, recursive=False, maxdepth=None):
        if not isinstance(path, list):
            path = [path]

        for p in path:
            p = self._strip_protocol(p).rstrip("/")
            if self.isdir(p):
                if not recursive:
                    raise ValueError("Cannot delete directory, set recursive=True")
                if osp.abspath(p) == os.getcwd():
                    raise ValueError("Cannot delete current working directory")
                shutil.rmtree(p)
            else:
                os.remove(p)

    def unstrip_protocol(self, name):
        name = self._strip_protocol(name)  # normalise for local/win/...
        return f"file://{name}"

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        path = self._strip_protocol(path)
        if self.auto_mkdir and "w" in mode:
            self.makedirs(self._parent(path), exist_ok=True)
        return LocalFileOpener(path, mode, fs=self, **kwargs)

    def touch(self, path, truncate=True, **kwargs):
        path = self._strip_protocol(path)
        if self.auto_mkdir:
            self.makedirs(self._parent(path), exist_ok=True)
        if self.exists(path):
            os.utime(path, None)
        else:
            open(path, "a").close()
        if truncate:
            os.truncate(path, 0)

    def created(self, path):
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(
            info["created"], tz=datetime.timezone.utc
        )

    def modified(self, path):
        info = self.info(path=path)
        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)

    @classmethod
    def _parent(cls, path):
        path = cls._strip_protocol(path).rstrip("/")
        if "/" in path:
            return path.rsplit("/", 1)[0]
        else:
            return cls.root_marker

    @classmethod
    def _strip_protocol(cls, path):
        path = stringify_path(path)
        if path.startswith("file://"):
            path = path[7:]
        elif path.startswith("file:"):
            path = path[5:]
        elif path.startswith("local://"):
|
219 |
+
path = path[8:]
|
220 |
+
elif path.startswith("local:"):
|
221 |
+
path = path[6:]
|
222 |
+
return make_path_posix(path).rstrip("/") or cls.root_marker
|
223 |
+
|
224 |
+
def _isfilestore(self):
|
225 |
+
# Inheriting from DaskFileSystem makes this False (S3, etc. were)
|
226 |
+
# the original motivation. But we are a posix-like file system.
|
227 |
+
# See https://github.com/dask/dask/issues/5526
|
228 |
+
return True
|
229 |
+
|
230 |
+
def chmod(self, path, mode):
|
231 |
+
path = stringify_path(path)
|
232 |
+
return os.chmod(path, mode)
|
233 |
+
|
234 |
+
|
235 |
+
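The class above is what fsspec.filesystem("file") returns. A minimal usage sketch of the behaviour documented in the docstring and implemented in _open/info above; the temporary directory and file names are illustrative only, not part of local.py.

# --- usage sketch (not part of local.py); paths are illustrative ---
import os
import tempfile

import fsspec

fs = fsspec.filesystem("file", auto_mkdir=True)
base = tempfile.mkdtemp()

# auto_mkdir=True makes _open create missing parent directories on write
path = os.path.join(base, "nested", "dir", "data.bin")
with fs.open(path, "wb") as f:
    f.write(b"hello")

info = fs.info(path)                  # name/size/type/mtime..., as assembled in info()
assert info["type"] == "file" and info["size"] == 5
print(fs.ls(os.path.dirname(path)))   # plain paths, since ls() defaults to detail=False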
def make_path_posix(path, sep=os.sep):
|
236 |
+
"""Make path generic"""
|
237 |
+
if isinstance(path, (list, set, tuple)):
|
238 |
+
return type(path)(make_path_posix(p) for p in path)
|
239 |
+
if "~" in path:
|
240 |
+
path = osp.expanduser(path)
|
241 |
+
if sep == "/":
|
242 |
+
# most common fast case for posix
|
243 |
+
if path.startswith("/"):
|
244 |
+
return path
|
245 |
+
if path.startswith("./"):
|
246 |
+
path = path[2:]
|
247 |
+
return f"{os.getcwd()}/{path}"
|
248 |
+
if (
|
249 |
+
(sep not in path and "/" not in path)
|
250 |
+
or (sep == "/" and not path.startswith("/"))
|
251 |
+
or (sep == "\\" and ":" not in path and not path.startswith("\\\\"))
|
252 |
+
):
|
253 |
+
# relative path like "path" or "rel\\path" (win) or "rel/path"
|
254 |
+
if os.sep == "\\":
|
255 |
+
# abspath made some more '\\' separators
|
256 |
+
return make_path_posix(osp.abspath(path))
|
257 |
+
else:
|
258 |
+
return f"{os.getcwd()}/{path}"
|
259 |
+
if path.startswith("file://"):
|
260 |
+
path = path[7:]
|
261 |
+
if re.match("/[A-Za-z]:", path):
|
262 |
+
# for windows file URI like "file:///C:/folder/file"
|
263 |
+
# or "file:///C:\\dir\\file"
|
264 |
+
path = path[1:].replace("\\", "/").replace("//", "/")
|
265 |
+
if path.startswith("\\\\"):
|
266 |
+
# special case for windows UNC/DFS-style paths, do nothing,
|
267 |
+
# just flip the slashes around (case below does not work!)
|
268 |
+
return path.replace("\\", "/")
|
269 |
+
if re.match("[A-Za-z]:", path):
|
270 |
+
# windows full path like "C:\\local\\path"
|
271 |
+
return path.lstrip("\\").replace("\\", "/").replace("//", "/")
|
272 |
+
if path.startswith("\\"):
|
273 |
+
# windows network path like "\\server\\path"
|
274 |
+
return "/" + path.lstrip("\\").replace("\\", "/").replace("//", "/")
|
275 |
+
return path
|
276 |
+
|
277 |
+
|
278 |
+
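A quick check of the normalisation rules handled above, run on a POSIX machine (the Windows forms are exercised by passing sep explicitly); the sample paths are illustrative.

# --- usage sketch (not part of local.py); outputs depend on platform and cwd ---
import os

print(make_path_posix("/tmp/data.csv"))              # absolute posix path: unchanged
print(make_path_posix("rel/data.csv"))               # relative: prefixed with os.getcwd()
print(make_path_posix(r"C:\local\path", sep="\\"))   # windows drive path -> "C:/local/path"
print(make_path_posix(r"\\server\share", sep="\\"))  # UNC path -> "//server/share"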
def trailing_sep(path):
|
279 |
+
"""Return True if the path ends with a path separator.
|
280 |
+
|
281 |
+
A forward slash is always considered a path separator, even on Operating
|
282 |
+
Systems that normally use a backslash.
|
283 |
+
"""
|
284 |
+
# TODO: if all incoming paths were posix-compliant then separator would
|
285 |
+
# always be a forward slash, simplifying this function.
|
286 |
+
# See https://github.com/fsspec/filesystem_spec/pull/1250
|
287 |
+
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
|
288 |
+
|
289 |
+
|
290 |
+
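And the corresponding behaviour of trailing_sep: a forward slash counts on every platform, while the backslash case only triggers where os.sep/os.altsep include it.

# --- usage sketch (not part of local.py) ---
assert trailing_sep("data/sub/")
assert not trailing_sep("data/sub")
# on Windows, where os.sep == "\\", trailing_sep("data\\sub\\") is also True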
class LocalFileOpener(io.IOBase):
|
291 |
+
def __init__(
|
292 |
+
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
|
293 |
+
):
|
294 |
+
logger.debug("open file: %s", path)
|
295 |
+
self.path = path
|
296 |
+
self.mode = mode
|
297 |
+
self.fs = fs
|
298 |
+
self.f = None
|
299 |
+
self.autocommit = autocommit
|
300 |
+
self.compression = get_compression(path, compression)
|
301 |
+
self.blocksize = io.DEFAULT_BUFFER_SIZE
|
302 |
+
self._open()
|
303 |
+
|
304 |
+
def _open(self):
|
305 |
+
if self.f is None or self.f.closed:
|
306 |
+
if self.autocommit or "w" not in self.mode:
|
307 |
+
self.f = open(self.path, mode=self.mode)
|
308 |
+
if self.compression:
|
309 |
+
compress = compr[self.compression]
|
310 |
+
self.f = compress(self.f, mode=self.mode)
|
311 |
+
else:
|
312 |
+
# TODO: check if path is writable?
|
313 |
+
i, name = tempfile.mkstemp()
|
314 |
+
os.close(i) # we want normal open and normal buffered file
|
315 |
+
self.temp = name
|
316 |
+
self.f = open(name, mode=self.mode)
|
317 |
+
if "w" not in self.mode:
|
318 |
+
self.size = self.f.seek(0, 2)
|
319 |
+
self.f.seek(0)
|
320 |
+
self.f.size = self.size
|
321 |
+
|
322 |
+
def _fetch_range(self, start, end):
|
323 |
+
# probably only used by cached FS
|
324 |
+
if "r" not in self.mode:
|
325 |
+
raise ValueError
|
326 |
+
self._open()
|
327 |
+
self.f.seek(start)
|
328 |
+
return self.f.read(end - start)
|
329 |
+
|
330 |
+
def __setstate__(self, state):
|
331 |
+
self.f = None
|
332 |
+
loc = state.pop("loc", None)
|
333 |
+
self.__dict__.update(state)
|
334 |
+
if "r" in state["mode"]:
|
335 |
+
self.f = None
|
336 |
+
self._open()
|
337 |
+
self.f.seek(loc)
|
338 |
+
|
339 |
+
def __getstate__(self):
|
340 |
+
d = self.__dict__.copy()
|
341 |
+
d.pop("f")
|
342 |
+
if "r" in self.mode:
|
343 |
+
d["loc"] = self.f.tell()
|
344 |
+
else:
|
345 |
+
if not self.f.closed:
|
346 |
+
raise ValueError("Cannot serialise open write-mode local file")
|
347 |
+
return d
|
348 |
+
|
349 |
+
def commit(self):
|
350 |
+
if self.autocommit:
|
351 |
+
raise RuntimeError("Can only commit if not already set to autocommit")
|
352 |
+
shutil.move(self.temp, self.path)
|
353 |
+
|
354 |
+
def discard(self):
|
355 |
+
if self.autocommit:
|
356 |
+
raise RuntimeError("Cannot discard if set to autocommit")
|
357 |
+
os.remove(self.temp)
|
358 |
+
|
359 |
+
def readable(self) -> bool:
|
360 |
+
return True
|
361 |
+
|
362 |
+
def writable(self) -> bool:
|
363 |
+
return "r" not in self.mode
|
364 |
+
|
365 |
+
def read(self, *args, **kwargs):
|
366 |
+
return self.f.read(*args, **kwargs)
|
367 |
+
|
368 |
+
def write(self, *args, **kwargs):
|
369 |
+
return self.f.write(*args, **kwargs)
|
370 |
+
|
371 |
+
def tell(self, *args, **kwargs):
|
372 |
+
return self.f.tell(*args, **kwargs)
|
373 |
+
|
374 |
+
def seek(self, *args, **kwargs):
|
375 |
+
return self.f.seek(*args, **kwargs)
|
376 |
+
|
377 |
+
def seekable(self, *args, **kwargs):
|
378 |
+
return self.f.seekable(*args, **kwargs)
|
379 |
+
|
380 |
+
def readline(self, *args, **kwargs):
|
381 |
+
return self.f.readline(*args, **kwargs)
|
382 |
+
|
383 |
+
def readlines(self, *args, **kwargs):
|
384 |
+
return self.f.readlines(*args, **kwargs)
|
385 |
+
|
386 |
+
def close(self):
|
387 |
+
return self.f.close()
|
388 |
+
|
389 |
+
def truncate(self, size=None) -> int:
|
390 |
+
return self.f.truncate(size)
|
391 |
+
|
392 |
+
@property
|
393 |
+
def closed(self):
|
394 |
+
return self.f.closed
|
395 |
+
|
396 |
+
def fileno(self):
|
397 |
+
return self.raw.fileno()
|
398 |
+
|
399 |
+
def flush(self) -> None:
|
400 |
+
self.f.flush()
|
401 |
+
|
402 |
+
def __iter__(self):
|
403 |
+
return self.f.__iter__()
|
404 |
+
|
405 |
+
def __getattr__(self, item):
|
406 |
+
return getattr(self.f, item)
|
407 |
+
|
408 |
+
def __enter__(self):
|
409 |
+
self._incontext = True
|
410 |
+
return self
|
411 |
+
|
412 |
+
def __exit__(self, exc_type, exc_value, traceback):
|
413 |
+
self._incontext = False
|
414 |
+
self.f.__exit__(exc_type, exc_value, traceback)
|
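LocalFileOpener only takes the tempfile-plus-commit() path when it is opened with autocommit=False; in practice that happens inside a filesystem transaction, which calls commit() (or discard() on error) for every file opened in the block. A sketch under that assumption; paths are illustrative.

# --- usage sketch (not part of local.py); paths are illustrative ---
import os
import tempfile

import fsspec

fs = fsspec.filesystem("file", auto_mkdir=True)
target = os.path.join(tempfile.mkdtemp(), "out.bin")

with fs.transaction:                  # files opened here get autocommit=False
    with fs.open(target, "wb") as f:
        f.write(b"payload")           # goes to the mkstemp() file created in _open
    # the data is only moved to `target` by commit() when the transaction ends

assert fs.cat_file(target) == b"payload"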
lib/python3.11/site-packages/fsspec/implementations/memory.py
ADDED
@@ -0,0 +1,292 @@
from __future__ import annotations

import logging
from datetime import datetime, timezone
from errno import ENOTEMPTY
from io import BytesIO
from typing import Any, ClassVar

from fsspec import AbstractFileSystem

logger = logging.Logger("fsspec.memoryfs")


class MemoryFileSystem(AbstractFileSystem):
    """A filesystem based on a dict of BytesIO objects

    This is a global filesystem so instances of this class all point to the same
    in memory filesystem.
    """

    store: ClassVar[dict[str, Any]] = {}  # global, do not overwrite!
    pseudo_dirs = [""]  # global, do not overwrite!
    protocol = "memory"
    root_marker = "/"

@classmethod
|
27 |
+
def _strip_protocol(cls, path):
|
28 |
+
if path.startswith("memory://"):
|
29 |
+
path = path[len("memory://") :]
|
30 |
+
if "::" in path or "://" in path:
|
31 |
+
return path.rstrip("/")
|
32 |
+
path = path.lstrip("/").rstrip("/")
|
33 |
+
return "/" + path if path else ""
|
34 |
+
|
35 |
+
def ls(self, path, detail=True, **kwargs):
|
36 |
+
path = self._strip_protocol(path)
|
37 |
+
if path in self.store:
|
38 |
+
# there is a key with this exact name
|
39 |
+
if not detail:
|
40 |
+
return [path]
|
41 |
+
return [
|
42 |
+
{
|
43 |
+
"name": path,
|
44 |
+
"size": self.store[path].size,
|
45 |
+
"type": "file",
|
46 |
+
"created": self.store[path].created.timestamp(),
|
47 |
+
}
|
48 |
+
]
|
49 |
+
paths = set()
|
50 |
+
starter = path + "/"
|
51 |
+
out = []
|
52 |
+
for p2 in tuple(self.store):
|
53 |
+
if p2.startswith(starter):
|
54 |
+
if "/" not in p2[len(starter) :]:
|
55 |
+
# exact child
|
56 |
+
out.append(
|
57 |
+
{
|
58 |
+
"name": p2,
|
59 |
+
"size": self.store[p2].size,
|
60 |
+
"type": "file",
|
61 |
+
"created": self.store[p2].created.timestamp(),
|
62 |
+
}
|
63 |
+
)
|
64 |
+
elif len(p2) > len(starter):
|
65 |
+
# implied child directory
|
66 |
+
ppath = starter + p2[len(starter) :].split("/", 1)[0]
|
67 |
+
if ppath not in paths:
|
68 |
+
out = out or []
|
69 |
+
out.append(
|
70 |
+
{
|
71 |
+
"name": ppath,
|
72 |
+
"size": 0,
|
73 |
+
"type": "directory",
|
74 |
+
}
|
75 |
+
)
|
76 |
+
paths.add(ppath)
|
77 |
+
for p2 in self.pseudo_dirs:
|
78 |
+
if p2.startswith(starter):
|
79 |
+
if "/" not in p2[len(starter) :]:
|
80 |
+
# exact child pdir
|
81 |
+
if p2 not in paths:
|
82 |
+
out.append({"name": p2, "size": 0, "type": "directory"})
|
83 |
+
paths.add(p2)
|
84 |
+
else:
|
85 |
+
# directory implied by deeper pdir
|
86 |
+
ppath = starter + p2[len(starter) :].split("/", 1)[0]
|
87 |
+
if ppath not in paths:
|
88 |
+
out.append({"name": ppath, "size": 0, "type": "directory"})
|
89 |
+
paths.add(ppath)
|
90 |
+
if not out:
|
91 |
+
if path in self.pseudo_dirs:
|
92 |
+
# empty dir
|
93 |
+
return []
|
94 |
+
raise FileNotFoundError(path)
|
95 |
+
if detail:
|
96 |
+
return out
|
97 |
+
return sorted([f["name"] for f in out])
|
98 |
+
|
99 |
+
def mkdir(self, path, create_parents=True, **kwargs):
|
100 |
+
path = self._strip_protocol(path)
|
101 |
+
if path in self.store or path in self.pseudo_dirs:
|
102 |
+
raise FileExistsError(path)
|
103 |
+
if self._parent(path).strip("/") and self.isfile(self._parent(path)):
|
104 |
+
raise NotADirectoryError(self._parent(path))
|
105 |
+
if create_parents and self._parent(path).strip("/"):
|
106 |
+
try:
|
107 |
+
self.mkdir(self._parent(path), create_parents, **kwargs)
|
108 |
+
except FileExistsError:
|
109 |
+
pass
|
110 |
+
if path and path not in self.pseudo_dirs:
|
111 |
+
self.pseudo_dirs.append(path)
|
112 |
+
|
113 |
+
def makedirs(self, path, exist_ok=False):
|
114 |
+
try:
|
115 |
+
self.mkdir(path, create_parents=True)
|
116 |
+
except FileExistsError:
|
117 |
+
if not exist_ok:
|
118 |
+
raise
|
119 |
+
|
120 |
+
def pipe_file(self, path, value, **kwargs):
|
121 |
+
"""Set the bytes of given file
|
122 |
+
|
123 |
+
Avoids copies of the data if possible
|
124 |
+
"""
|
125 |
+
self.open(path, "wb", data=value)
|
126 |
+
|
127 |
+
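Since pipe_file above just opens the path in "wb" mode with the payload handed over as data=, the resulting MemoryFile wraps those bytes directly, and cat_file slices the same buffer. A small sketch with illustrative paths:

# --- usage sketch (not part of memory.py); paths are illustrative ---
import fsspec

m = fsspec.filesystem("memory")
m.pipe_file("/demo/blob.bin", b"0123456789")

assert m.cat_file("/demo/blob.bin") == b"0123456789"
assert m.cat_file("/demo/blob.bin", start=2, end=5) == b"234"   # getbuffer() slice
assert m.info("/demo/blob.bin")["size"] == 10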
def rmdir(self, path):
|
128 |
+
path = self._strip_protocol(path)
|
129 |
+
if path == "":
|
130 |
+
# silently avoid deleting FS root
|
131 |
+
return
|
132 |
+
if path in self.pseudo_dirs:
|
133 |
+
if not self.ls(path):
|
134 |
+
self.pseudo_dirs.remove(path)
|
135 |
+
else:
|
136 |
+
raise OSError(ENOTEMPTY, "Directory not empty", path)
|
137 |
+
else:
|
138 |
+
raise FileNotFoundError(path)
|
139 |
+
|
140 |
+
def info(self, path, **kwargs):
|
141 |
+
path = self._strip_protocol(path)
|
142 |
+
if path in self.pseudo_dirs or any(
|
143 |
+
p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
|
144 |
+
):
|
145 |
+
return {
|
146 |
+
"name": path,
|
147 |
+
"size": 0,
|
148 |
+
"type": "directory",
|
149 |
+
}
|
150 |
+
elif path in self.store:
|
151 |
+
filelike = self.store[path]
|
152 |
+
return {
|
153 |
+
"name": path,
|
154 |
+
"size": filelike.size,
|
155 |
+
"type": "file",
|
156 |
+
"created": getattr(filelike, "created", None),
|
157 |
+
}
|
158 |
+
else:
|
159 |
+
raise FileNotFoundError(path)
|
160 |
+
|
161 |
+
def _open(
|
162 |
+
self,
|
163 |
+
path,
|
164 |
+
mode="rb",
|
165 |
+
block_size=None,
|
166 |
+
autocommit=True,
|
167 |
+
cache_options=None,
|
168 |
+
**kwargs,
|
169 |
+
):
|
170 |
+
path = self._strip_protocol(path)
|
171 |
+
if path in self.pseudo_dirs:
|
172 |
+
raise IsADirectoryError(path)
|
173 |
+
parent = path
|
174 |
+
while len(parent) > 1:
|
175 |
+
parent = self._parent(parent)
|
176 |
+
if self.isfile(parent):
|
177 |
+
raise FileExistsError(parent)
|
178 |
+
if mode in ["rb", "ab", "r+b"]:
|
179 |
+
if path in self.store:
|
180 |
+
f = self.store[path]
|
181 |
+
if mode == "ab":
|
182 |
+
# position at the end of file
|
183 |
+
f.seek(0, 2)
|
184 |
+
else:
|
185 |
+
# position at the beginning of file
|
186 |
+
f.seek(0)
|
187 |
+
return f
|
188 |
+
else:
|
189 |
+
raise FileNotFoundError(path)
|
190 |
+
elif mode == "wb":
|
191 |
+
m = MemoryFile(self, path, kwargs.get("data"))
|
192 |
+
if not self._intrans:
|
193 |
+
m.commit()
|
194 |
+
return m
|
195 |
+
else:
|
196 |
+
name = self.__class__.__name__
|
197 |
+
raise ValueError(f"unsupported file mode for {name}: {mode!r}")
|
198 |
+
|
199 |
+
def cp_file(self, path1, path2, **kwargs):
|
200 |
+
path1 = self._strip_protocol(path1)
|
201 |
+
path2 = self._strip_protocol(path2)
|
202 |
+
if self.isfile(path1):
|
203 |
+
self.store[path2] = MemoryFile(
|
204 |
+
self, path2, self.store[path1].getvalue()
|
205 |
+
) # implicit copy
|
206 |
+
elif self.isdir(path1):
|
207 |
+
if path2 not in self.pseudo_dirs:
|
208 |
+
self.pseudo_dirs.append(path2)
|
209 |
+
else:
|
210 |
+
raise FileNotFoundError(path1)
|
211 |
+
|
212 |
+
def cat_file(self, path, start=None, end=None, **kwargs):
|
213 |
+
path = self._strip_protocol(path)
|
214 |
+
try:
|
215 |
+
return bytes(self.store[path].getbuffer()[start:end])
|
216 |
+
except KeyError:
|
217 |
+
raise FileNotFoundError(path)
|
218 |
+
|
219 |
+
def _rm(self, path):
|
220 |
+
path = self._strip_protocol(path)
|
221 |
+
try:
|
222 |
+
del self.store[path]
|
223 |
+
except KeyError as e:
|
224 |
+
raise FileNotFoundError(path) from e
|
225 |
+
|
226 |
+
def modified(self, path):
|
227 |
+
path = self._strip_protocol(path)
|
228 |
+
try:
|
229 |
+
return self.store[path].modified
|
230 |
+
except KeyError:
|
231 |
+
raise FileNotFoundError(path)
|
232 |
+
|
233 |
+
def created(self, path):
|
234 |
+
path = self._strip_protocol(path)
|
235 |
+
try:
|
236 |
+
return self.store[path].created
|
237 |
+
except KeyError:
|
238 |
+
raise FileNotFoundError(path)
|
239 |
+
|
240 |
+
def rm(self, path, recursive=False, maxdepth=None):
|
241 |
+
if isinstance(path, str):
|
242 |
+
path = self._strip_protocol(path)
|
243 |
+
else:
|
244 |
+
path = [self._strip_protocol(p) for p in path]
|
245 |
+
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
246 |
+
for p in reversed(paths):
|
247 |
+
# If the expanded path doesn't exist, it is only because the expanded
|
248 |
+
# path was a directory that does not exist in self.pseudo_dirs. This
|
249 |
+
# is possible if you directly create files without making the
|
250 |
+
# directories first.
|
251 |
+
if not self.exists(p):
|
252 |
+
continue
|
253 |
+
if self.isfile(p):
|
254 |
+
self.rm_file(p)
|
255 |
+
else:
|
256 |
+
self.rmdir(p)
|
257 |
+
|
258 |
+
|
259 |
+
class MemoryFile(BytesIO):
|
260 |
+
"""A BytesIO which can't close and works as a context manager
|
261 |
+
|
262 |
+
Can initialise with data. Each path should only be active once at any moment.
|
263 |
+
|
264 |
+
No need to provide fs, path if auto-committing (default)
|
265 |
+
"""
|
266 |
+
|
267 |
+
def __init__(self, fs=None, path=None, data=None):
|
268 |
+
logger.debug("open file %s", path)
|
269 |
+
self.fs = fs
|
270 |
+
self.path = path
|
271 |
+
self.created = datetime.now(tz=timezone.utc)
|
272 |
+
self.modified = datetime.now(tz=timezone.utc)
|
273 |
+
if data:
|
274 |
+
super().__init__(data)
|
275 |
+
self.seek(0)
|
276 |
+
|
277 |
+
@property
|
278 |
+
def size(self):
|
279 |
+
return self.getbuffer().nbytes
|
280 |
+
|
281 |
+
def __enter__(self):
|
282 |
+
return self
|
283 |
+
|
284 |
+
def close(self):
|
285 |
+
pass
|
286 |
+
|
287 |
+
def discard(self):
|
288 |
+
pass
|
289 |
+
|
290 |
+
def commit(self):
|
291 |
+
self.fs.store[self.path] = self
|
292 |
+
self.modified = datetime.now(tz=timezone.utc)
|
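Because store and pseudo_dirs are class-level, every instance (including the cached one returned by fsspec.filesystem("memory")) shares the same contents, which is what the class docstring means by a global filesystem. A short illustration; the names are arbitrary.

# --- usage sketch (not part of memory.py) ---
import fsspec
from fsspec.implementations.memory import MemoryFileSystem

a = fsspec.filesystem("memory")
b = MemoryFileSystem()                  # different instance, same class-level store

a.mkdir("/shared")
with a.open("/shared/note.txt", "wb") as f:
    f.write(b"visible everywhere")

assert b.cat_file("/shared/note.txt") == b"visible everywhere"
assert "/shared/note.txt" in b.ls("/shared", detail=False)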
lib/python3.11/site-packages/fsspec/implementations/reference.py
ADDED
@@ -0,0 +1,1122 @@
import base64
import collections
import io
import itertools
import logging
import math
import os
from functools import lru_cache
from typing import TYPE_CHECKING

import fsspec.core

try:
    import ujson as json
except ImportError:
    if not TYPE_CHECKING:
        import json

from ..asyn import AsyncFileSystem
from ..callbacks import _DEFAULT_CALLBACK
from ..core import filesystem, open, split_protocol
from ..utils import isfilelike, merge_offset_ranges, other_paths

logger = logging.getLogger("fsspec.reference")


class ReferenceNotReachable(RuntimeError):
|
28 |
+
def __init__(self, reference, target, *args):
|
29 |
+
super().__init__(*args)
|
30 |
+
self.reference = reference
|
31 |
+
self.target = target
|
32 |
+
|
33 |
+
def __str__(self):
|
34 |
+
return f'Reference "{self.reference}" failed to fetch target {self.target}'
|
35 |
+
|
36 |
+
|
37 |
+
def _first(d):
|
38 |
+
return list(d.values())[0]
|
39 |
+
|
40 |
+
|
41 |
+
def _prot_in_references(path, references):
|
42 |
+
ref = references.get(path)
|
43 |
+
if isinstance(ref, (list, tuple)):
|
44 |
+
return split_protocol(ref[0])[0] if ref[0] else ref[0]
|
45 |
+
|
46 |
+
|
47 |
+
def _protocol_groups(paths, references):
|
48 |
+
if isinstance(paths, str):
|
49 |
+
return {_prot_in_references(paths, references): [paths]}
|
50 |
+
out = {}
|
51 |
+
for path in paths:
|
52 |
+
protocol = _prot_in_references(path, references)
|
53 |
+
out.setdefault(protocol, []).append(path)
|
54 |
+
return out
|
55 |
+
|
56 |
+
|
57 |
+
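These two helpers are how cat() later batches requests per target protocol. A toy check against an illustrative, kerchunk-style references dict (inline bytes plus [url, offset, size] entries):

# --- usage sketch (not part of reference.py); references are illustrative ---
refs = {
    "a/0": b"inline bytes",
    "a/1": ["s3://bucket/key", 0, 100],
    "a/2": ["local-file.bin", 100, 100],   # no protocol -> grouped under None
}

assert _prot_in_references("a/1", refs) == "s3"
assert _protocol_groups(["a/0", "a/1", "a/2"], refs) == {
    None: ["a/0", "a/2"],
    "s3": ["a/1"],
}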
class RefsValuesView(collections.abc.ValuesView):
|
58 |
+
def __iter__(self):
|
59 |
+
for val in self._mapping.zmetadata.values():
|
60 |
+
yield json.dumps(val).encode()
|
61 |
+
yield from self._mapping._items.values()
|
62 |
+
for field in self._mapping.listdir():
|
63 |
+
chunk_sizes = self._mapping._get_chunk_sizes(field)
|
64 |
+
if len(chunk_sizes) == 0:
|
65 |
+
yield self._mapping[field + "/0"]
|
66 |
+
continue
|
67 |
+
yield from self._mapping._generate_all_records(field)
|
68 |
+
|
69 |
+
|
70 |
+
class RefsItemsView(collections.abc.ItemsView):
|
71 |
+
def __iter__(self):
|
72 |
+
return zip(self._mapping.keys(), self._mapping.values())
|
73 |
+
|
74 |
+
|
75 |
+
def ravel_multi_index(idx, sizes):
|
76 |
+
val = 0
|
77 |
+
mult = 1
|
78 |
+
for i, s in zip(idx[::-1], sizes[::-1]):
|
79 |
+
val += i * mult
|
80 |
+
mult *= s
|
81 |
+
return val
|
82 |
+
|
83 |
+
|
84 |
+
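ravel_multi_index flattens a chunk index in row-major (C) order, the same thing numpy.ravel_multi_index computes, without needing numpy at import time. A quick worked check:

# --- usage sketch (not part of reference.py) ---
# chunk grid of shape (2, 3, 4): index (1, 2, 3) flattens to 1*(3*4) + 2*4 + 3 = 23
assert ravel_multi_index((1, 2, 3), (2, 3, 4)) == 23

import numpy as np  # only needed for this cross-check
assert ravel_multi_index((1, 2, 3), (2, 3, 4)) == np.ravel_multi_index((1, 2, 3), (2, 3, 4))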
class LazyReferenceMapper(collections.abc.MutableMapping):
|
85 |
+
"""This interface can be used to read/write references from Parquet stores.
|
86 |
+
It is not intended for other types of references.
|
87 |
+
It can be used with Kerchunk's MultiZarrToZarr method to combine
|
88 |
+
references into a parquet store.
|
89 |
+
Examples of this use-case can be found here:
|
90 |
+
https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
|
91 |
+
|
92 |
+
# import is class level to prevent numpy dep requirement for fsspec
|
93 |
+
@property
|
94 |
+
def np(self):
|
95 |
+
import numpy as np
|
96 |
+
|
97 |
+
return np
|
98 |
+
|
99 |
+
@property
|
100 |
+
def pd(self):
|
101 |
+
import pandas as pd
|
102 |
+
|
103 |
+
return pd
|
104 |
+
|
105 |
+
def __init__(
|
106 |
+
self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10
|
107 |
+
):
|
108 |
+
"""
|
109 |
+
Parameters
|
110 |
+
----------
|
111 |
+
root : str
|
112 |
+
Root of parquet store
|
113 |
+
fs : fsspec.AbstractFileSystem
|
114 |
+
fsspec filesystem object, default is local filesystem.
|
115 |
+
cache_size : int, default=128
|
116 |
+
Maximum size of LRU cache, where cache_size*record_size denotes
|
117 |
+
the total number of references that can be loaded in memory at once.
|
118 |
+
categorical_threshold : int
|
119 |
+
Encode urls as pandas.Categorical to reduce memory footprint if the ratio
|
120 |
+
of the total number of refs to the number of unique urls for each variable
|
121 |
+
is greater than or equal to this number. (default 10)
|
122 |
+
|
123 |
+
|
124 |
+
"""
|
125 |
+
self.root = root
|
126 |
+
self.chunk_sizes = {}
|
127 |
+
self._items = {}
|
128 |
+
self.dirs = None
|
129 |
+
self.fs = fsspec.filesystem("file") if fs is None else fs
|
130 |
+
self._items[".zmetadata"] = self.fs.cat_file(
|
131 |
+
"/".join([self.root, ".zmetadata"])
|
132 |
+
)
|
133 |
+
met = json.loads(self._items[".zmetadata"])
|
134 |
+
self.record_size = met["record_size"]
|
135 |
+
self.zmetadata = met["metadata"]
|
136 |
+
self.url = self.root + "/{field}/refs.{record}.parq"
|
137 |
+
self.out_root = out_root or self.root
|
138 |
+
self.cat_thresh = categorical_threshold
|
139 |
+
|
140 |
+
# Define function to open and decompress refs
|
141 |
+
@lru_cache(maxsize=cache_size)
|
142 |
+
def open_refs(field, record):
|
143 |
+
"""cached parquet file loader"""
|
144 |
+
path = self.url.format(field=field, record=record)
|
145 |
+
data = io.BytesIO(self.fs.cat_file(path))
|
146 |
+
df = self.pd.read_parquet(data, engine="fastparquet")
|
147 |
+
refs = {c: df[c].values for c in df.columns}
|
148 |
+
return refs
|
149 |
+
|
150 |
+
self.open_refs = open_refs
|
151 |
+
|
152 |
+
@staticmethod
|
153 |
+
def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
|
154 |
+
"""Make empty parquet reference set
|
155 |
+
|
156 |
+
Parameters
|
157 |
+
----------
|
158 |
+
root: str
|
159 |
+
Directory to contain the output; will be created
|
160 |
+
storage_options: dict | None
|
161 |
+
For making the filesystem to use for writing, if fs is None
|
162 |
+
fs: FileSystem | None
|
163 |
+
Filesystem for writing
|
164 |
+
record_size: int
|
165 |
+
Number of references per parquet file
|
166 |
+
kwargs: passed to __init__
|
167 |
+
|
168 |
+
Returns
|
169 |
+
-------
|
170 |
+
LazyReferenceMapper instance
|
171 |
+
"""
|
172 |
+
met = {"metadata": {}, "record_size": record_size}
|
173 |
+
if fs is None:
|
174 |
+
fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
|
175 |
+
fs.makedirs(root, exist_ok=True)
|
176 |
+
fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
|
177 |
+
return LazyReferenceMapper(root, fs, **kwargs)
|
178 |
+
|
179 |
+
def listdir(self, basename=True):
|
180 |
+
"""List top-level directories"""
|
181 |
+
if self.dirs is None:
|
182 |
+
dirs = [p.split("/", 1)[0] for p in self.zmetadata]
|
183 |
+
self.dirs = {p for p in dirs if p and not p.startswith(".")}
|
184 |
+
listing = self.dirs
|
185 |
+
if basename:
|
186 |
+
listing = [os.path.basename(path) for path in listing]
|
187 |
+
return listing
|
188 |
+
|
189 |
+
def ls(self, path="", detail=True):
|
190 |
+
"""Shortcut file listings"""
|
191 |
+
if not path:
|
192 |
+
dirnames = self.listdir()
|
193 |
+
others = set(
|
194 |
+
[".zmetadata"]
|
195 |
+
+ [name for name in self.zmetadata if "/" not in name]
|
196 |
+
+ [name for name in self._items if "/" not in name]
|
197 |
+
)
|
198 |
+
if detail is False:
|
199 |
+
others.update(dirnames)
|
200 |
+
return sorted(others)
|
201 |
+
dirinfo = [
|
202 |
+
{"name": name, "type": "directory", "size": 0} for name in dirnames
|
203 |
+
]
|
204 |
+
fileinfo = [
|
205 |
+
{
|
206 |
+
"name": name,
|
207 |
+
"type": "file",
|
208 |
+
"size": len(
|
209 |
+
json.dumps(self.zmetadata[name])
|
210 |
+
if name in self.zmetadata
|
211 |
+
else self._items[name]
|
212 |
+
),
|
213 |
+
}
|
214 |
+
for name in others
|
215 |
+
]
|
216 |
+
return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
|
217 |
+
parts = path.split("/", 1)
|
218 |
+
if len(parts) > 1:
|
219 |
+
raise FileNotFoundError("Cannot list within directories right now")
|
220 |
+
field = parts[0]
|
221 |
+
others = set(
|
222 |
+
[name for name in self.zmetadata if name.startswith(f"{path}/")]
|
223 |
+
+ [name for name in self._items if name.startswith(f"{path}/")]
|
224 |
+
)
|
225 |
+
fileinfo = [
|
226 |
+
{
|
227 |
+
"name": name,
|
228 |
+
"type": "file",
|
229 |
+
"size": len(
|
230 |
+
json.dumps(self.zmetadata[name])
|
231 |
+
if name in self.zmetadata
|
232 |
+
else self._items[name]
|
233 |
+
),
|
234 |
+
}
|
235 |
+
for name in others
|
236 |
+
]
|
237 |
+
keys = self._keys_in_field(field)
|
238 |
+
|
239 |
+
if detail is False:
|
240 |
+
return list(others) + list(keys)
|
241 |
+
recs = self._generate_all_records(field)
|
242 |
+
recinfo = [
|
243 |
+
{"name": name, "type": "file", "size": rec[-1]}
|
244 |
+
for name, rec in zip(keys, recs)
|
245 |
+
if rec[0] # filters out path==None, deleted/missing
|
246 |
+
]
|
247 |
+
return fileinfo + recinfo
|
248 |
+
|
249 |
+
def _load_one_key(self, key):
|
250 |
+
"""Get the reference for one key
|
251 |
+
|
252 |
+
Returns bytes, one-element list or three-element list.
|
253 |
+
"""
|
254 |
+
if key in self._items:
|
255 |
+
return self._items[key]
|
256 |
+
elif key in self.zmetadata:
|
257 |
+
return json.dumps(self.zmetadata[key]).encode()
|
258 |
+
elif "/" not in key or self._is_meta(key):
|
259 |
+
raise KeyError(key)
|
260 |
+
field, sub_key = key.split("/")
|
261 |
+
record, _, _ = self._key_to_record(key)
|
262 |
+
maybe = self._items.get((field, key), {}).get(sub_key, False)
|
263 |
+
if maybe is None:
|
264 |
+
# explicitly deleted
|
265 |
+
raise KeyError
|
266 |
+
elif maybe:
|
267 |
+
return maybe
|
268 |
+
|
269 |
+
# Chunk keys can be loaded from row group and cached in LRU cache
|
270 |
+
try:
|
271 |
+
record, ri, chunk_size = self._key_to_record(key)
|
272 |
+
if chunk_size == 0:
|
273 |
+
return b""
|
274 |
+
refs = self.open_refs(field, record)
|
275 |
+
except (ValueError, TypeError, FileNotFoundError):
|
276 |
+
raise KeyError(key)
|
277 |
+
columns = ["path", "offset", "size", "raw"]
|
278 |
+
selection = [refs[c][ri] if c in refs else None for c in columns]
|
279 |
+
raw = selection[-1]
|
280 |
+
if raw is not None:
|
281 |
+
return raw
|
282 |
+
if selection[0] is None:
|
283 |
+
raise KeyError("This reference has been deleted")
|
284 |
+
if selection[1:3] == [0, 0]:
|
285 |
+
# URL only
|
286 |
+
return selection[:1]
|
287 |
+
# URL, offset, size
|
288 |
+
return selection[:3]
|
289 |
+
|
290 |
+
@lru_cache(4096)
|
291 |
+
def _key_to_record(self, key):
|
292 |
+
"""Details needed to construct a reference for one key"""
|
293 |
+
field, chunk = key.split("/")
|
294 |
+
chunk_sizes = self._get_chunk_sizes(field)
|
295 |
+
if len(chunk_sizes) == 0:
|
296 |
+
return 0, 0, 0
|
297 |
+
chunk_idx = [int(c) for c in chunk.split(".")]
|
298 |
+
chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
|
299 |
+
record = chunk_number // self.record_size
|
300 |
+
ri = chunk_number % self.record_size
|
301 |
+
return record, ri, len(chunk_sizes)
|
302 |
+
|
303 |
+
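The key-to-record mapping in _key_to_record above is pure arithmetic: the chunk index is flattened with ravel_multi_index over the per-axis chunk counts, then split into a parquet file number and a row within it by record_size. A worked example with a hypothetical 100x100 chunk grid and record_size=10000:

# --- worked example (not part of reference.py); sizes are hypothetical ---
record_size = 10_000
chunk_sizes = (100, 100)              # chunks per axis, as _get_chunk_sizes would return

key = "field/37.42"                   # zarr-style chunk key "field/i.j"
chunk_idx = [int(c) for c in key.split("/")[1].split(".")]
chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)   # 37*100 + 42 = 3742

record = chunk_number // record_size  # parquet file field/refs.0.parq
row = chunk_number % record_size      # row 3742 within that file
assert (record, row) == (0, 3742)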
def _get_chunk_sizes(self, field):
|
304 |
+
"""The number of chunks along each axis for a given field"""
|
305 |
+
if field not in self.chunk_sizes:
|
306 |
+
zarray = self.zmetadata[f"{field}/.zarray"]
|
307 |
+
size_ratio = [
|
308 |
+
math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
|
309 |
+
]
|
310 |
+
self.chunk_sizes[field] = size_ratio
|
311 |
+
return self.chunk_sizes[field]
|
312 |
+
|
313 |
+
def _generate_record(self, field, record):
|
314 |
+
"""The references for a given parquet file of a given field"""
|
315 |
+
refs = self.open_refs(field, record)
|
316 |
+
it = iter(zip(*refs.values()))
|
317 |
+
if len(refs) == 3:
|
318 |
+
# All urls
|
319 |
+
return (list(t) for t in it)
|
320 |
+
elif len(refs) == 1:
|
321 |
+
# All raws
|
322 |
+
return refs["raw"]
|
323 |
+
else:
|
324 |
+
# Mix of urls and raws
|
325 |
+
return (list(t[:3]) if not t[3] else t[3] for t in it)
|
326 |
+
|
327 |
+
def _generate_all_records(self, field):
|
328 |
+
"""Load all the references within a field by iterating over the parquet files"""
|
329 |
+
nrec = 1
|
330 |
+
for ch in self._get_chunk_sizes(field):
|
331 |
+
nrec *= ch
|
332 |
+
nrec = math.ceil(nrec / self.record_size)
|
333 |
+
for record in range(nrec):
|
334 |
+
yield from self._generate_record(field, record)
|
335 |
+
|
336 |
+
def values(self):
|
337 |
+
return RefsValuesView(self)
|
338 |
+
|
339 |
+
def items(self):
|
340 |
+
return RefsItemsView(self)
|
341 |
+
|
342 |
+
def __hash__(self):
|
343 |
+
return id(self)
|
344 |
+
|
345 |
+
@lru_cache(20)
|
346 |
+
def __getitem__(self, key):
|
347 |
+
return self._load_one_key(key)
|
348 |
+
|
349 |
+
def __setitem__(self, key, value):
|
350 |
+
if "/" in key and not self._is_meta(key):
|
351 |
+
field, chunk = key.split("/")
|
352 |
+
record, i, _ = self._key_to_record(key)
|
353 |
+
subdict = self._items.setdefault((field, record), {})
|
354 |
+
subdict[i] = value
|
355 |
+
if len(subdict) == self.record_size:
|
356 |
+
self.write(field, record)
|
357 |
+
else:
|
358 |
+
# metadata or top-level
|
359 |
+
self._items[key] = value
|
360 |
+
self.zmetadata[key] = json.loads(
|
361 |
+
value.decode() if isinstance(value, bytes) else value
|
362 |
+
)
|
363 |
+
|
364 |
+
@staticmethod
|
365 |
+
def _is_meta(key):
|
366 |
+
return key.startswith(".z") or "/.z" in key
|
367 |
+
|
368 |
+
def __delitem__(self, key):
|
369 |
+
if key in self._items:
|
370 |
+
del self._items[key]
|
371 |
+
elif key in self.zmetadata:
|
372 |
+
del self.zmetadata[key]
|
373 |
+
else:
|
374 |
+
if "/" in key and not self._is_meta(key):
|
375 |
+
field, chunk = key.split("/")
|
376 |
+
record, _, _ = self._key_to_record(key)
|
377 |
+
subdict = self._items.setdefault((field, record), {})
|
378 |
+
subdict[chunk] = None
|
379 |
+
if len(subdict) == self.record_size:
|
380 |
+
self.write(field, record)
|
381 |
+
else:
|
382 |
+
# metadata or top-level
|
383 |
+
self._items[key] = None
|
384 |
+
|
385 |
+
def write(self, field, record, base_url=None, storage_options=None):
|
386 |
+
# extra requirements if writing
|
387 |
+
import kerchunk.df
|
388 |
+
import numpy as np
|
389 |
+
import pandas as pd
|
390 |
+
|
391 |
+
# TODO: if the dict is incomplete, also load records and merge in
|
392 |
+
partition = self._items[(field, record)]
|
393 |
+
fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
|
394 |
+
|
395 |
+
####
|
396 |
+
paths = np.full(self.record_size, np.nan, dtype="O")
|
397 |
+
offsets = np.zeros(self.record_size, dtype="int64")
|
398 |
+
sizes = np.zeros(self.record_size, dtype="int64")
|
399 |
+
raws = np.full(self.record_size, np.nan, dtype="O")
|
400 |
+
nraw = 0
|
401 |
+
npath = 0
|
402 |
+
for j, data in partition.items():
|
403 |
+
if isinstance(data, list):
|
404 |
+
npath += 1
|
405 |
+
paths[j] = data[0]
|
406 |
+
if len(data) > 1:
|
407 |
+
offsets[j] = data[1]
|
408 |
+
sizes[j] = data[2]
|
409 |
+
else:
|
410 |
+
nraw += 1
|
411 |
+
raws[j] = kerchunk.df._proc_raw(data)
|
412 |
+
# TODO: only save needed columns
|
413 |
+
df = pd.DataFrame(
|
414 |
+
{
|
415 |
+
"path": paths,
|
416 |
+
"offset": offsets,
|
417 |
+
"size": sizes,
|
418 |
+
"raw": raws,
|
419 |
+
},
|
420 |
+
copy=False,
|
421 |
+
)
|
422 |
+
if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh:
|
423 |
+
df["path"] = df["path"].astype("category")
|
424 |
+
object_encoding = {"raw": "bytes", "path": "utf8"}
|
425 |
+
has_nulls = ["path", "raw"]
|
426 |
+
|
427 |
+
self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
|
428 |
+
df.to_parquet(
|
429 |
+
fn,
|
430 |
+
engine="fastparquet",
|
431 |
+
storage_options=storage_options
|
432 |
+
or getattr(self.fs, "storage_options", None),
|
433 |
+
compression="zstd",
|
434 |
+
index=False,
|
435 |
+
stats=False,
|
436 |
+
object_encoding=object_encoding,
|
437 |
+
has_nulls=has_nulls,
|
438 |
+
# **kwargs,
|
439 |
+
)
|
440 |
+
partition.clear()
|
441 |
+
self._items.pop((field, record))
|
442 |
+
|
443 |
+
def flush(self, base_url=None, storage_options=None):
|
444 |
+
"""Output any modified or deleted keys
|
445 |
+
|
446 |
+
Parameters
|
447 |
+
----------
|
448 |
+
base_url: str
|
449 |
+
Location of the output
|
450 |
+
"""
|
451 |
+
# write what we have so far and clear sub chunks
|
452 |
+
for thing in list(self._items):
|
453 |
+
if isinstance(thing, tuple):
|
454 |
+
field, record = thing
|
455 |
+
self.write(
|
456 |
+
field,
|
457 |
+
record,
|
458 |
+
base_url=base_url,
|
459 |
+
storage_options=storage_options,
|
460 |
+
)
|
461 |
+
|
462 |
+
# gather .zmetadata from self._items and write that too
|
463 |
+
for k in list(self._items):
|
464 |
+
if k != ".zmetadata" and ".z" in k:
|
465 |
+
self.zmetadata[k] = json.loads(self._items.pop(k))
|
466 |
+
met = {"metadata": self.zmetadata, "record_size": self.record_size}
|
467 |
+
self._items[".zmetadata"] = json.dumps(met).encode()
|
468 |
+
self.fs.pipe(
|
469 |
+
"/".join([base_url or self.out_root, ".zmetadata"]),
|
470 |
+
self._items[".zmetadata"],
|
471 |
+
)
|
472 |
+
|
473 |
+
# TODO: only clear those that we wrote to?
|
474 |
+
self.open_refs.cache_clear()
|
475 |
+
|
476 |
+
def __len__(self):
|
477 |
+
# Caveat: This counts expected references, not actual
|
478 |
+
count = 0
|
479 |
+
for field in self.listdir():
|
480 |
+
if field.startswith("."):
|
481 |
+
count += 1
|
482 |
+
else:
|
483 |
+
chunk_sizes = self._get_chunk_sizes(field)
|
484 |
+
nchunks = self.np.product(chunk_sizes)
|
485 |
+
count += nchunks
|
486 |
+
count += len(self.zmetadata) # all metadata keys
|
487 |
+
count += len(self._items) # the metadata file itself
|
488 |
+
return count
|
489 |
+
|
490 |
+
def __iter__(self):
|
491 |
+
# Caveat: Note that this generates all expected keys, but does not
|
492 |
+
# account for reference keys that are missing.
|
493 |
+
metas = set(self.zmetadata)
|
494 |
+
metas.update(self._items)
|
495 |
+
for bit in metas:
|
496 |
+
if isinstance(bit, str):
|
497 |
+
yield bit
|
498 |
+
for field in self.listdir():
|
499 |
+
yield from self._keys_in_field(field)
|
500 |
+
|
501 |
+
def __contains__(self, item):
|
502 |
+
try:
|
503 |
+
self._load_one_key(item)
|
504 |
+
return True
|
505 |
+
except KeyError:
|
506 |
+
return False
|
507 |
+
|
508 |
+
def _keys_in_field(self, field):
|
509 |
+
"""List key names in given field
|
510 |
+
|
511 |
+
Produces strings like "field/x.y" appropriate from the chunking of the array
|
512 |
+
"""
|
513 |
+
chunk_sizes = self._get_chunk_sizes(field)
|
514 |
+
if len(chunk_sizes) == 0:
|
515 |
+
yield field + "/0"
|
516 |
+
return
|
517 |
+
inds = itertools.product(*(range(i) for i in chunk_sizes))
|
518 |
+
for ind in inds:
|
519 |
+
yield field + "/" + ".".join([str(c) for c in ind])
|
520 |
+
|
521 |
+
|
522 |
+
class ReferenceFileSystem(AsyncFileSystem):
|
523 |
+
"""View byte ranges of some other file as a file system
|
524 |
+
Initial version: single file system target, which must support
|
525 |
+
async, and must allow start and end args in _cat_file. Later versions
|
526 |
+
may allow multiple arbitrary URLs for the targets.
|
527 |
+
This FileSystem is read-only. It is designed to be used with async
|
528 |
+
targets (for now). This FileSystem only allows whole-file access, no
|
529 |
+
``open``. We do not get original file details from the target FS.
|
530 |
+
Configuration is by passing a dict of references at init, or a URL to
|
531 |
+
a JSON file containing the same; this dict
|
532 |
+
can also contain concrete data for some set of paths.
|
533 |
+
Reference dict format:
|
534 |
+
{path0: bytes_data, path1: (target_url, offset, size)}
|
535 |
+
https://github.com/fsspec/kerchunk/blob/main/README.md
|
536 |
+
"""
|
537 |
+
|
538 |
+
protocol = "reference"
|
539 |
+
|
540 |
+
def __init__(
|
541 |
+
self,
|
542 |
+
fo,
|
543 |
+
target=None,
|
544 |
+
ref_storage_args=None,
|
545 |
+
target_protocol=None,
|
546 |
+
target_options=None,
|
547 |
+
remote_protocol=None,
|
548 |
+
remote_options=None,
|
549 |
+
fs=None,
|
550 |
+
template_overrides=None,
|
551 |
+
simple_templates=True,
|
552 |
+
max_gap=64_000,
|
553 |
+
max_block=256_000_000,
|
554 |
+
cache_size=128,
|
555 |
+
**kwargs,
|
556 |
+
):
|
557 |
+
"""
|
558 |
+
Parameters
|
559 |
+
----------
|
560 |
+
fo : dict or str
|
561 |
+
The set of references to use for this instance, with a structure as above.
|
562 |
+
If str referencing a JSON file, will use fsspec.open, in conjunction
|
563 |
+
with target_options and target_protocol to open and parse JSON at this
|
564 |
+
location. If a directory, then assume references are a set of parquet
|
565 |
+
files to be loaded lazily.
|
566 |
+
target : str
|
567 |
+
For any references having target_url as None, this is the default file
|
568 |
+
target to use
|
569 |
+
ref_storage_args : dict
|
570 |
+
If references is a str, use these kwargs for loading the JSON file.
|
571 |
+
Deprecated: use target_options instead.
|
572 |
+
target_protocol : str
|
573 |
+
Used for loading the reference file, if it is a path. If None, protocol
|
574 |
+
will be derived from the given path
|
575 |
+
target_options : dict
|
576 |
+
Extra FS options for loading the reference file ``fo``, if given as a path
|
577 |
+
remote_protocol : str
|
578 |
+
The protocol of the filesystem on which the references will be evaluated
|
579 |
+
(unless fs is provided). If not given, will be derived from the first
|
580 |
+
URL that has a protocol in the templates or in the references, in that
|
581 |
+
order.
|
582 |
+
remote_options : dict
|
583 |
+
kwargs to go with remote_protocol
|
584 |
+
fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict))
|
585 |
+
Directly provide a file system(s):
|
586 |
+
- a single filesystem instance
|
587 |
+
- a dict of protocol:filesystem, where each value is either a filesystem
|
588 |
+
instance, or a dict of kwargs that can be used to create an
|
589 |
+
instance for the given protocol
|
590 |
+
|
591 |
+
If this is given, remote_options and remote_protocol are ignored.
|
592 |
+
template_overrides : dict
|
593 |
+
Swap out any templates in the references file with these - useful for
|
594 |
+
testing.
|
595 |
+
simple_templates: bool
|
596 |
+
Whether templates can be processed with simple replace (True) or if
|
597 |
+
jinja is needed (False, much slower). All reference sets produced by
|
598 |
+
``kerchunk`` are simple in this sense, but the spec allows for complex.
|
599 |
+
max_gap, max_block: int
|
600 |
+
For merging multiple concurrent requests to the same remote file.
|
601 |
+
Neighboring byte ranges will only be merged when their
|
602 |
+
inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0
|
603 |
+
to only merge when it requires no extra bytes. Pass a negative
|
604 |
+
number to disable merging, appropriate for local target files.
|
605 |
+
Neighboring byte ranges will only be merged when the size of
|
606 |
+
the aggregated range is <= ``max_block``. Default is 256MB.
|
607 |
+
cache_size : int
|
608 |
+
Maximum size of LRU cache, where cache_size*record_size denotes
|
609 |
+
the total number of references that can be loaded in memory at once.
|
610 |
+
Only used for lazily loaded references.
|
611 |
+
kwargs : passed to parent class
|
612 |
+
"""
|
613 |
+
super().__init__(**kwargs)
|
614 |
+
self.target = target
|
615 |
+
self.template_overrides = template_overrides
|
616 |
+
self.simple_templates = simple_templates
|
617 |
+
self.templates = {}
|
618 |
+
self.fss = {}
|
619 |
+
self._dircache = {}
|
620 |
+
self.max_gap = max_gap
|
621 |
+
self.max_block = max_block
|
622 |
+
if isinstance(fo, str):
|
623 |
+
dic = dict(
|
624 |
+
**(ref_storage_args or target_options or {}), protocol=target_protocol
|
625 |
+
)
|
626 |
+
ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
|
627 |
+
if ref_fs.isfile(fo2):
|
628 |
+
# text JSON
|
629 |
+
with fsspec.open(fo, "rb", **dic) as f:
|
630 |
+
logger.info("Read reference from URL %s", fo)
|
631 |
+
text = json.load(f)
|
632 |
+
self._process_references(text, template_overrides)
|
633 |
+
else:
|
634 |
+
# Lazy parquet refs
|
635 |
+
logger.info("Open lazy reference dict from URL %s", fo)
|
636 |
+
self.references = LazyReferenceMapper(
|
637 |
+
fo2,
|
638 |
+
fs=ref_fs,
|
639 |
+
cache_size=cache_size,
|
640 |
+
)
|
641 |
+
else:
|
642 |
+
# dictionaries
|
643 |
+
self._process_references(fo, template_overrides)
|
644 |
+
if isinstance(fs, dict):
|
645 |
+
self.fss = {
|
646 |
+
k: (
|
647 |
+
fsspec.filesystem(k.split(":", 1)[0], **opts)
|
648 |
+
if isinstance(opts, dict)
|
649 |
+
else opts
|
650 |
+
)
|
651 |
+
for k, opts in fs.items()
|
652 |
+
}
|
653 |
+
if None not in self.fss:
|
654 |
+
self.fss[None] = filesystem("file")
|
655 |
+
return
|
656 |
+
if fs is not None:
|
657 |
+
# single remote FS
|
658 |
+
remote_protocol = (
|
659 |
+
fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
|
660 |
+
)
|
661 |
+
self.fss[remote_protocol] = fs
|
662 |
+
|
663 |
+
if remote_protocol is None:
|
664 |
+
# get single protocol from any templates
|
665 |
+
for ref in self.templates.values():
|
666 |
+
if callable(ref):
|
667 |
+
ref = ref()
|
668 |
+
protocol, _ = fsspec.core.split_protocol(ref)
|
669 |
+
if protocol and protocol not in self.fss:
|
670 |
+
fs = filesystem(protocol, **(remote_options or {}))
|
671 |
+
self.fss[protocol] = fs
|
672 |
+
if remote_protocol is None:
|
673 |
+
# get single protocol from references
|
674 |
+
# TODO: warning here, since this can be very expensive?
|
675 |
+
for ref in self.references.values():
|
676 |
+
if callable(ref):
|
677 |
+
ref = ref()
|
678 |
+
if isinstance(ref, list) and ref[0]:
|
679 |
+
protocol, _ = fsspec.core.split_protocol(ref[0])
|
680 |
+
if protocol not in self.fss:
|
681 |
+
fs = filesystem(protocol, **(remote_options or {}))
|
682 |
+
self.fss[protocol] = fs
|
683 |
+
# only use first remote URL
|
684 |
+
break
|
685 |
+
|
686 |
+
if remote_protocol and remote_protocol not in self.fss:
|
687 |
+
fs = filesystem(remote_protocol, **(remote_options or {}))
|
688 |
+
self.fss[remote_protocol] = fs
|
689 |
+
|
690 |
+
self.fss[None] = fs or filesystem("file") # default one
|
691 |
+
|
692 |
+
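A self-contained way to exercise the constructor above is a spec-version-0 dict whose targets live on the (global) memory filesystem; all names and payloads below are illustrative.

# --- usage sketch (not part of reference.py); paths and payloads are illustrative ---
import fsspec
from fsspec.implementations.reference import ReferenceFileSystem

mem = fsspec.filesystem("memory")
mem.pipe_file("/target/blob.bin", b"0123456789abcdef")

refs = {
    "whole": ["memory://target/blob.bin"],         # whole-file reference
    "part": ["memory://target/blob.bin", 4, 6],    # 6 bytes starting at offset 4
    "inline": b"literal bytes",                    # data embedded in the reference
}

rfs = ReferenceFileSystem(refs, remote_protocol="memory")
assert rfs.cat("whole") == b"0123456789abcdef"
assert rfs.cat("part") == b"456789"
assert rfs.cat("inline") == b"literal bytes"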
def _cat_common(self, path, start=None, end=None):
|
693 |
+
path = self._strip_protocol(path)
|
694 |
+
logger.debug(f"cat: {path}")
|
695 |
+
try:
|
696 |
+
part = self.references[path]
|
697 |
+
except KeyError:
|
698 |
+
raise FileNotFoundError(path)
|
699 |
+
if isinstance(part, str):
|
700 |
+
part = part.encode()
|
701 |
+
if isinstance(part, bytes):
|
702 |
+
logger.debug(f"Reference: {path}, type bytes")
|
703 |
+
if part.startswith(b"base64:"):
|
704 |
+
part = base64.b64decode(part[7:])
|
705 |
+
return part, None, None
|
706 |
+
|
707 |
+
if len(part) == 1:
|
708 |
+
logger.debug(f"Reference: {path}, whole file => {part}")
|
709 |
+
url = part[0]
|
710 |
+
start1, end1 = start, end
|
711 |
+
else:
|
712 |
+
url, start0, size = part
|
713 |
+
logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}")
|
714 |
+
end0 = start0 + size
|
715 |
+
|
716 |
+
if start is not None:
|
717 |
+
if start >= 0:
|
718 |
+
start1 = start0 + start
|
719 |
+
else:
|
720 |
+
start1 = end0 + start
|
721 |
+
else:
|
722 |
+
start1 = start0
|
723 |
+
if end is not None:
|
724 |
+
if end >= 0:
|
725 |
+
end1 = start0 + end
|
726 |
+
else:
|
727 |
+
end1 = end0 + end
|
728 |
+
else:
|
729 |
+
end1 = end0
|
730 |
+
if url is None:
|
731 |
+
url = self.target
|
732 |
+
return url, start1, end1
|
733 |
+
|
734 |
+
async def _cat_file(self, path, start=None, end=None, **kwargs):
|
735 |
+
part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
|
736 |
+
if isinstance(part_or_url, bytes):
|
737 |
+
return part_or_url[start:end]
|
738 |
+
protocol, _ = split_protocol(part_or_url)
|
739 |
+
try:
|
740 |
+
return await self.fss[protocol]._cat_file(part_or_url, start=start0, end=end0)
|
741 |
+
except Exception as e:
|
742 |
+
raise ReferenceNotReachable(path, part_or_url) from e
|
743 |
+
|
744 |
+
def cat_file(self, path, start=None, end=None, **kwargs):
|
745 |
+
part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
|
746 |
+
if isinstance(part_or_url, bytes):
|
747 |
+
return part_or_url[start:end]
|
748 |
+
protocol, _ = split_protocol(part_or_url)
|
749 |
+
try:
|
750 |
+
return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0)
|
751 |
+
except Exception as e:
|
752 |
+
raise ReferenceNotReachable(path, part_or_url) from e
|
753 |
+
|
754 |
+
def pipe_file(self, path, value, **_):
|
755 |
+
"""Temporarily add binary data or reference as a file"""
|
756 |
+
self.references[path] = value
|
757 |
+
|
758 |
+
async def _get_file(self, rpath, lpath, **kwargs):
|
759 |
+
if self.isdir(rpath):
|
760 |
+
return os.makedirs(lpath, exist_ok=True)
|
761 |
+
data = await self._cat_file(rpath)
|
762 |
+
with open(lpath, "wb") as f:
|
763 |
+
f.write(data)
|
764 |
+
|
765 |
+
def get_file(self, rpath, lpath, callback=_DEFAULT_CALLBACK, **kwargs):
|
766 |
+
if self.isdir(rpath):
|
767 |
+
return os.makedirs(lpath, exist_ok=True)
|
768 |
+
data = self.cat_file(rpath, **kwargs)
|
769 |
+
callback.set_size(len(data))
|
770 |
+
if isfilelike(lpath):
|
771 |
+
lpath.write(data)
|
772 |
+
else:
|
773 |
+
with open(lpath, "wb") as f:
|
774 |
+
f.write(data)
|
775 |
+
callback.absolute_update(len(data))
|
776 |
+
|
777 |
+
def get(self, rpath, lpath, recursive=False, **kwargs):
|
778 |
+
if recursive:
|
779 |
+
# trigger directory build
|
780 |
+
self.ls("")
|
781 |
+
rpath = self.expand_path(rpath, recursive=recursive)
|
782 |
+
fs = fsspec.filesystem("file", auto_mkdir=True)
|
783 |
+
targets = other_paths(rpath, lpath)
|
784 |
+
if recursive:
|
785 |
+
data = self.cat([r for r in rpath if not self.isdir(r)])
|
786 |
+
else:
|
787 |
+
data = self.cat(rpath)
|
788 |
+
for remote, local in zip(rpath, targets):
|
789 |
+
if remote in data:
|
790 |
+
fs.pipe_file(local, data[remote])
|
791 |
+
|
792 |
+
def cat(self, path, recursive=False, on_error="raise", **kwargs):
|
793 |
+
if isinstance(path, str) and recursive:
|
794 |
+
raise NotImplementedError
|
795 |
+
if isinstance(path, list) and (recursive or any("*" in p for p in path)):
|
796 |
+
raise NotImplementedError
|
797 |
+
# TODO: if references is lazy, pre-fetch all paths in batch before access
|
798 |
+
proto_dict = _protocol_groups(path, self.references)
|
799 |
+
out = {}
|
800 |
+
for proto, paths in proto_dict.items():
|
801 |
+
fs = self.fss[proto]
|
802 |
+
urls, starts, ends, valid_paths = [], [], [], []
|
803 |
+
for p in paths:
|
804 |
+
# find references or label not-found. Early exit if any not
|
805 |
+
# found and on_error is "raise"
|
806 |
+
try:
|
807 |
+
u, s, e = self._cat_common(p)
|
808 |
+
except FileNotFoundError as err:
|
809 |
+
if on_error == "raise":
|
810 |
+
raise
|
811 |
+
if on_error != "omit":
|
812 |
+
out[p] = err
|
813 |
+
else:
|
814 |
+
urls.append(u)
|
815 |
+
starts.append(s)
|
816 |
+
ends.append(e)
|
817 |
+
valid_paths.append(p)
|
818 |
+
|
819 |
+
# process references into form for merging
|
820 |
+
urls2 = []
|
821 |
+
starts2 = []
|
822 |
+
ends2 = []
|
823 |
+
paths2 = []
|
824 |
+
whole_files = set()
|
825 |
+
for u, s, e, p in zip(urls, starts, ends, valid_paths):
|
826 |
+
if isinstance(u, bytes):
|
827 |
+
# data
|
828 |
+
out[p] = u
|
829 |
+
elif s is None:
|
830 |
+
# whole file - limits are None, None, but no further
|
831 |
+
# entries take for this file
|
832 |
+
whole_files.add(u)
|
833 |
+
urls2.append(u)
|
834 |
+
starts2.append(s)
|
835 |
+
ends2.append(e)
|
836 |
+
paths2.append(p)
|
837 |
+
for u, s, e, p in zip(urls, starts, ends, valid_paths):
|
838 |
+
# second run to account for files that are to be loaded whole
|
839 |
+
if s is not None and u not in whole_files:
|
840 |
+
urls2.append(u)
|
841 |
+
starts2.append(s)
|
842 |
+
ends2.append(e)
|
843 |
+
paths2.append(p)
|
844 |
+
|
845 |
+
# merge and fetch consolidated ranges
|
846 |
+
new_paths, new_starts, new_ends = merge_offset_ranges(
|
847 |
+
list(urls2),
|
848 |
+
list(starts2),
|
849 |
+
list(ends2),
|
850 |
+
sort=True,
|
851 |
+
max_gap=self.max_gap,
|
852 |
+
max_block=self.max_block,
|
853 |
+
)
|
854 |
+
bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)
|
855 |
+
|
856 |
+
# unbundle from merged bytes - simple approach
|
857 |
+
for u, s, e, p in zip(urls, starts, ends, valid_paths):
|
858 |
+
if p in out:
|
859 |
+
continue # was bytes, already handled
|
860 |
+
for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
|
861 |
+
if np == u and (ns is None or ne is None):
|
862 |
+
if isinstance(b, Exception):
|
863 |
+
out[p] = b
|
864 |
+
else:
|
865 |
+
out[p] = b[s:e]
|
866 |
+
elif np == u and s >= ns and e <= ne:
|
867 |
+
if isinstance(b, Exception):
|
868 |
+
out[p] = b
|
869 |
+
else:
|
870 |
+
out[p] = b[s - ns : (e - ne) or None]
|
871 |
+
|
872 |
+
for k, v in out.copy().items():
|
873 |
+
# these were valid references, but fetch failed, so transform exc
|
874 |
+
if isinstance(v, Exception) and k in self.references:
|
875 |
+
ex = out[k]
|
876 |
+
new_ex = ReferenceNotReachable(k, self.references[k])
|
877 |
+
new_ex.__cause__ = ex
|
878 |
+
if on_error == "raise":
|
879 |
+
raise new_ex
|
880 |
+
elif on_error != "omit":
|
881 |
+
out[k] = new_ex
|
882 |
+
|
883 |
+
if len(out) == 1 and isinstance(path, str) and "*" not in path:
|
884 |
+
return _first(out)
|
885 |
+
return out
|
886 |
+
|
887 |
+
def _process_references(self, references, template_overrides=None):
|
888 |
+
vers = references.get("version", None)
|
889 |
+
if vers is None:
|
890 |
+
self._process_references0(references)
|
891 |
+
elif vers == 1:
|
892 |
+
self._process_references1(references, template_overrides=template_overrides)
|
893 |
+
else:
|
894 |
+
raise ValueError(f"Unknown reference spec version: {vers}")
|
895 |
+
# TODO: we make dircache by iterating over all entries, but for Spec >= 1,
|
896 |
+
# can replace with programmatic. Is it even needed for mapper interface?
|
897 |
+
|
898 |
+
def _process_references0(self, references):
|
899 |
+
"""Make reference dict for Spec Version 0"""
|
900 |
+
self.references = references
|
901 |
+
|
902 |
+
def _process_references1(self, references, template_overrides=None):
|
903 |
+
if not self.simple_templates or self.templates:
|
904 |
+
import jinja2
|
905 |
+
self.references = {}
|
906 |
+
self._process_templates(references.get("templates", {}))
|
907 |
+
|
908 |
+
@lru_cache(1000)
|
909 |
+
def _render_jinja(u):
|
910 |
+
return jinja2.Template(u).render(**self.templates)
|
911 |
+
|
912 |
+
for k, v in references.get("refs", {}).items():
|
913 |
+
if isinstance(v, str):
|
914 |
+
if v.startswith("base64:"):
|
915 |
+
self.references[k] = base64.b64decode(v[7:])
|
916 |
+
self.references[k] = v
|
917 |
+
elif self.templates:
|
918 |
+
u = v[0]
|
919 |
+
if "{{" in u:
|
920 |
+
if self.simple_templates:
|
921 |
+
u = (
|
922 |
+
u.replace("{{", "{")
|
923 |
+
.replace("}}", "}")
|
924 |
+
.format(**self.templates)
|
925 |
+
)
|
926 |
+
else:
|
927 |
+
u = _render_jinja(u)
|
928 |
+
self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
|
929 |
+
else:
|
930 |
+
self.references[k] = v
|
931 |
+
self.references.update(self._process_gen(references.get("gen", [])))
|
932 |
+
|
933 |
+
def _process_templates(self, tmp):
|
934 |
+
self.templates = {}
|
935 |
+
if self.template_overrides is not None:
|
936 |
+
tmp.update(self.template_overrides)
|
937 |
+
for k, v in tmp.items():
|
938 |
+
if "{{" in v:
|
939 |
+
import jinja2
|
940 |
+
|
941 |
+
self.templates[k] = lambda temp=v, **kwargs: jinja2.Template(
|
942 |
+
temp
|
943 |
+
).render(**kwargs)
|
944 |
+
else:
|
945 |
+
self.templates[k] = v
|
946 |
+
|
947 |
+
def _process_gen(self, gens):
|
948 |
+
out = {}
|
949 |
+
for gen in gens:
|
950 |
+
dimension = {
|
951 |
+
k: v
|
952 |
+
if isinstance(v, list)
|
953 |
+
else range(v.get("start", 0), v["stop"], v.get("step", 1))
|
954 |
+
for k, v in gen["dimensions"].items()
|
955 |
+
}
|
956 |
+
products = (
|
957 |
+
dict(zip(dimension.keys(), values))
|
958 |
+
for values in itertools.product(*dimension.values())
|
959 |
+
)
|
960 |
+
for pr in products:
|
961 |
+
import jinja2
|
962 |
+
|
963 |
+
key = jinja2.Template(gen["key"]).render(**pr, **self.templates)
|
964 |
+
url = jinja2.Template(gen["url"]).render(**pr, **self.templates)
|
965 |
+
if ("offset" in gen) and ("length" in gen):
|
966 |
+
offset = int(
|
967 |
+
jinja2.Template(gen["offset"]).render(**pr, **self.templates)
|
968 |
+
)
|
969 |
+
length = int(
|
970 |
+
jinja2.Template(gen["length"]).render(**pr, **self.templates)
|
971 |
+
)
|
972 |
+
out[key] = [url, offset, length]
|
973 |
+
elif ("offset" in gen) ^ ("length" in gen):
|
974 |
+
raise ValueError(
|
975 |
+
"Both 'offset' and 'length' are required for a "
|
976 |
+
"reference generator entry if either is provided."
|
977 |
+
)
|
978 |
+
else:
|
979 |
+
out[key] = [url]
|
980 |
+
return out
|
981 |
+
|
982 |
+
def _dircache_from_items(self):
|
983 |
+
self.dircache = {"": []}
|
984 |
+
it = self.references.items()
|
985 |
+
for path, part in it:
|
986 |
+
if isinstance(part, (bytes, str)):
|
987 |
+
size = len(part)
|
988 |
+
elif len(part) == 1:
|
989 |
+
size = None
|
990 |
+
else:
|
991 |
+
_, _, size = part
|
992 |
+
par = path.rsplit("/", 1)[0] if "/" in path else ""
|
993 |
+
par0 = par
|
994 |
+
subdirs = [par0]
|
995 |
+
while par0 and par0 not in self.dircache:
|
996 |
+
# collect parent directories
|
997 |
+
par0 = self._parent(par0)
|
998 |
+
subdirs.append(par0)
|
999 |
+
|
1000 |
+
subdirs = subdirs[::-1]
|
1001 |
+
for parent, child in zip(subdirs, subdirs[1:]):
|
1002 |
+
# register newly discovered directories
|
1003 |
+
assert child not in self.dircache
|
1004 |
+
assert parent in self.dircache
|
1005 |
+
self.dircache[parent].append(
|
1006 |
+
{"name": child, "type": "directory", "size": 0}
|
1007 |
+
)
|
1008 |
+
self.dircache[child] = []
|
1009 |
+
|
1010 |
+
self.dircache[par].append({"name": path, "type": "file", "size": size})
|
1011 |
+
|
1012 |
+
def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
|
1013 |
+
data = self.cat_file(path) # load whole chunk into memory
|
1014 |
+
return io.BytesIO(data)
|
1015 |
+
|
1016 |
+
def ls(self, path, detail=True, **kwargs):
|
1017 |
+
path = self._strip_protocol(path)
|
1018 |
+
if isinstance(self.references, LazyReferenceMapper):
|
1019 |
+
try:
|
1020 |
+
return self.references.ls(path, detail)
|
1021 |
+
except KeyError:
|
1022 |
+
pass
|
1023 |
+
raise FileNotFoundError(f"'{path}' is not a known key")
|
1024 |
+
if not self.dircache:
|
1025 |
+
self._dircache_from_items()
|
1026 |
+
out = self._ls_from_cache(path)
|
1027 |
+
if out is None:
|
1028 |
+
raise FileNotFoundError(path)
|
1029 |
+
if detail:
|
1030 |
+
return out
|
1031 |
+
return [o["name"] for o in out]
|
1032 |
+
|
1033 |
+
def exists(self, path, **kwargs): # overwrite auto-sync version
|
1034 |
+
return self.isdir(path) or self.isfile(path)
|
1035 |
+
|
1036 |
+
def isdir(self, path): # overwrite auto-sync version
|
1037 |
+
if self.dircache:
|
1038 |
+
return path in self.dircache
|
1039 |
+
elif isinstance(self.references, LazyReferenceMapper):
|
1040 |
+
return path in self.references.listdir("")
|
1041 |
+
else:
|
1042 |
+
# this may be faster than building dircache for single calls, but
|
1043 |
+
# by looping will be slow for many calls; could cache it?
|
1044 |
+
return any(_.startswith(f"{path}/") for _ in self.references)
|
1045 |
+
|
1046 |
+
def isfile(self, path): # overwrite auto-sync version
|
1047 |
+
return path in self.references
|
1048 |
+
|
1049 |
+
async def _ls(self, path, detail=True, **kwargs): # calls fast sync code
|
1050 |
+
return self.ls(path, detail, **kwargs)
|
1051 |
+
|
1052 |
+
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
|
1053 |
+
if withdirs:
|
1054 |
+
return super().find(
|
1055 |
+
path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
|
1056 |
+
)
|
1057 |
+
if path:
|
1058 |
+
path = self._strip_protocol(path)
|
1059 |
+
r = sorted(k for k in self.references if k.startswith(path))
|
1060 |
+
else:
|
1061 |
+
r = sorted(self.references)
|
1062 |
+
if detail:
|
1063 |
+
if not self.dircache:
|
1064 |
+
self._dircache_from_items()
|
1065 |
+
return {k: self._ls_from_cache(k)[0] for k in r}
|
1066 |
+
else:
|
1067 |
+
return r
|
1068 |
+
|
1069 |
+
def info(self, path, **kwargs):
|
1070 |
+
out = self.references.get(path)
|
1071 |
+
if out is not None:
|
1072 |
+
if isinstance(out, (str, bytes)):
|
1073 |
+
# decode base64 here
|
1074 |
+
return {"name": path, "type": "file", "size": len(out)}
|
1075 |
+
elif len(out) > 1:
|
1076 |
+
return {"name": path, "type": "file", "size": out[2]}
|
1077 |
+
else:
|
1078 |
+
out0 = [{"name": path, "type": "file", "size": None}]
|
1079 |
+
else:
|
1080 |
+
out = self.ls(path, True)
|
1081 |
+
out0 = [o for o in out if o["name"] == path]
|
1082 |
+
if not out0:
|
1083 |
+
return {"name": path, "type": "directory", "size": 0}
|
1084 |
+
if out0[0]["size"] is None:
|
1085 |
+
# if this is a whole remote file, update size using remote FS
|
1086 |
+
prot, _ = split_protocol(self.references[path][0])
|
1087 |
+
out0[0]["size"] = self.fss[prot].size(self.references[path][0])
|
1088 |
+
return out0[0]
|
1089 |
+
|
1090 |
+
async def _info(self, path, **kwargs): # calls fast sync code
|
1091 |
+
return self.info(path)
|
1092 |
+
|
1093 |
+
async def _rm_file(self, path, **kwargs):
|
1094 |
+
self.references.pop(
|
1095 |
+
path, None
|
1096 |
+
) # ignores FileNotFound, just as well for directories
|
1097 |
+
self.dircache.clear() # this is a bit heavy handed
|
1098 |
+
|
1099 |
+
async def _pipe_file(self, path, data):
|
1100 |
+
# can be str or bytes
|
1101 |
+
self.references[path] = data
|
1102 |
+
self.dircache.clear() # this is a bit heavy handed
|
1103 |
+
|
1104 |
+
async def _put_file(self, lpath, rpath):
|
1105 |
+
# puts binary
|
1106 |
+
with open(lpath, "rb") as f:
|
1107 |
+
self.references[rpath] = f.read()
|
1108 |
+
self.dircache.clear() # this is a bit heavy handed
|
1109 |
+
|
1110 |
+
def save_json(self, url, **storage_options):
|
1111 |
+
"""Write modified references into new location"""
|
1112 |
+
out = {}
|
1113 |
+
for k, v in self.references.items():
|
1114 |
+
if isinstance(v, bytes):
|
1115 |
+
try:
|
1116 |
+
out[k] = v.decode("ascii")
|
1117 |
+
except UnicodeDecodeError:
|
1118 |
+
out[k] = (b"base64:" + base64.b64encode(v)).decode()
|
1119 |
+
else:
|
1120 |
+
out[k] = v
|
1121 |
+
with fsspec.open(url, "wb", **storage_options) as f:
|
1122 |
+
f.write(json.dumps({"version": 1, "refs": out}).encode())
|
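As a rough usage sketch for the version-1 reference spec that `_process_references1` above expands (templates plus `[url, offset, length]` entries): the backing "memory" store, the file name and the template key below are made up for illustration, and the reference filesystem is opened through `fsspec.filesystem("reference", ...)`.

import fsspec

# put some bytes where the references can point (illustrative only)
mem = fsspec.filesystem("memory")
mem.pipe_file("/data.bin", b"0123456789")

refs = {
    "version": 1,
    "templates": {"u": "memory://"},
    "refs": {
        "whole": ["{{u}}/data.bin"],       # whole remote file
        "part": ["{{u}}/data.bin", 2, 4],  # 4 bytes starting at offset 2
    },
}

fs = fsspec.filesystem("reference", fo=refs, remote_protocol="memory")
print(fs.cat_file("whole"))  # b"0123456789"
print(fs.cat_file("part"))   # b"2345"

With `simple_templates` (the default), the `{{u}}` placeholder is filled by plain string formatting; jinja2 is only imported when richer templates or generators are present.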
lib/python3.11/site-packages/fsspec/implementations/sftp.py
ADDED
@@ -0,0 +1,180 @@
+import datetime
+import logging
+import os
+import types
+import uuid
+from stat import S_ISDIR, S_ISLNK
+
+import paramiko
+
+from .. import AbstractFileSystem
+from ..utils import infer_storage_options
+
+logger = logging.getLogger("fsspec.sftp")
+
+
+class SFTPFileSystem(AbstractFileSystem):
+    """Files over SFTP/SSH
+
+    Peer-to-peer filesystem over SSH using paramiko.
+
+    Note: if using this with the ``open`` or ``open_files``, with full URLs,
+    there is no way to tell if a path is relative, so all paths are assumed
+    to be absolute.
+    """
+
+    protocol = "sftp", "ssh"
+
+    def __init__(self, host, **ssh_kwargs):
+        """
+
+        Parameters
+        ----------
+        host: str
+            Hostname or IP as a string
+        temppath: str
+            Location on the server to put files, when within a transaction
+        ssh_kwargs: dict
+            Parameters passed on to connection. See details in
+            https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
+            May include port, username, password...
+        """
+        if self._cached:
+            return
+        super().__init__(**ssh_kwargs)
+        self.temppath = ssh_kwargs.pop("temppath", "/tmp")  # remote temp directory
+        self.host = host
+        self.ssh_kwargs = ssh_kwargs
+        self._connect()
+
+    def _connect(self):
+        logger.debug("Connecting to SFTP server %s", self.host)
+        self.client = paramiko.SSHClient()
+        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        self.client.connect(self.host, **self.ssh_kwargs)
+        self.ftp = self.client.open_sftp()
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        return infer_storage_options(path)["path"]
+
+    @staticmethod
+    def _get_kwargs_from_urls(urlpath):
+        out = infer_storage_options(urlpath)
+        out.pop("path", None)
+        out.pop("protocol", None)
+        return out
+
+    def mkdir(self, path, create_parents=False, mode=511):
+        logger.debug("Creating folder %s", path)
+        if self.exists(path):
+            raise FileExistsError(f"File exists: {path}")
+
+        if create_parents:
+            self.makedirs(path)
+        else:
+            self.ftp.mkdir(path, mode)
+
+    def makedirs(self, path, exist_ok=False, mode=511):
+        if self.exists(path) and not exist_ok:
+            raise FileExistsError(f"File exists: {path}")
+
+        parts = path.split("/")
+        new_path = "/" if path[:1] == "/" else ""
+
+        for part in parts:
+            if part:
+                new_path = f"{new_path}/{part}" if new_path else part
+                if not self.exists(new_path):
+                    self.ftp.mkdir(new_path, mode)
+
+    def rmdir(self, path):
+        logger.debug("Removing folder %s", path)
+        self.ftp.rmdir(path)
+
+    def info(self, path):
+        stat = self._decode_stat(self.ftp.stat(path))
+        stat["name"] = path
+        return stat
+
+    @staticmethod
+    def _decode_stat(stat, parent_path=None):
+        if S_ISDIR(stat.st_mode):
+            t = "directory"
+        elif S_ISLNK(stat.st_mode):
+            t = "link"
+        else:
+            t = "file"
+        out = {
+            "name": "",
+            "size": stat.st_size,
+            "type": t,
+            "uid": stat.st_uid,
+            "gid": stat.st_gid,
+            "time": datetime.datetime.fromtimestamp(
+                stat.st_atime, tz=datetime.timezone.utc
+            ),
+            "mtime": datetime.datetime.fromtimestamp(
+                stat.st_mtime, tz=datetime.timezone.utc
+            ),
+        }
+        if parent_path:
+            out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
+        return out
+
+    def ls(self, path, detail=False):
+        logger.debug("Listing folder %s", path)
+        stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
+        if detail:
+            return stats
+        else:
+            paths = [stat["name"] for stat in stats]
+            return sorted(paths)
+
+    def put(self, lpath, rpath, callback=None, **kwargs):
+        logger.debug("Put file %s into %s", lpath, rpath)
+        self.ftp.put(lpath, rpath)
+
+    def get_file(self, rpath, lpath, **kwargs):
+        if self.isdir(rpath):
+            os.makedirs(lpath, exist_ok=True)
+        else:
+            self.ftp.get(self._strip_protocol(rpath), lpath)
+
+    def _open(self, path, mode="rb", block_size=None, **kwargs):
+        """
+        block_size: int or None
+            If 0, no buffering, if 1, line buffering, if >1, buffer that many
+            bytes, if None use default from paramiko.
+        """
+        logger.debug("Opening file %s", path)
+        if kwargs.get("autocommit", True) is False:
+            # writes to temporary file, move on commit
+            path2 = "/".join([self.temppath, str(uuid.uuid4())])
+            f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
+            f.temppath = path2
+            f.targetpath = path
+            f.fs = self
+            f.commit = types.MethodType(commit_a_file, f)
+            f.discard = types.MethodType(discard_a_file, f)
+        else:
+            f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
+        return f
+
+    def _rm(self, path):
+        if self.isdir(path):
+            self.ftp.rmdir(path)
+        else:
+            self.ftp.remove(path)
+
+    def mv(self, old, new):
+        logger.debug("Renaming %s into %s", old, new)
+        self.ftp.posix_rename(old, new)
+
+
+def commit_a_file(self):
+    self.fs.mv(self.temppath, self.targetpath)
+
+
+def discard_a_file(self):
+    self.fs._rm(self.temppath)
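A minimal usage sketch for the SFTPFileSystem above; the host, credentials and remote paths are placeholders, and any extra keyword arguments are handed straight to paramiko's SSHClient.connect.

import fsspec

fs = fsspec.filesystem("sftp", host="sftp.example.com", username="user", password="secret")
print(fs.ls("/upload"))                        # stat-based listing, sorted paths
with fs.open("/upload/report.csv", "rb") as f:
    head = f.read(100)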
lib/python3.11/site-packages/fsspec/implementations/smb.py
ADDED
@@ -0,0 +1,324 @@
1 |
+
"""
|
2 |
+
This module contains SMBFileSystem class responsible for handling access to
|
3 |
+
Windows Samba network shares by using package smbprotocol
|
4 |
+
"""
|
5 |
+
|
6 |
+
import datetime
|
7 |
+
import uuid
|
8 |
+
from stat import S_ISDIR, S_ISLNK
|
9 |
+
|
10 |
+
import smbclient
|
11 |
+
|
12 |
+
from .. import AbstractFileSystem
|
13 |
+
from ..utils import infer_storage_options
|
14 |
+
|
15 |
+
# ! pylint: disable=bad-continuation
|
16 |
+
|
17 |
+
|
18 |
+
class SMBFileSystem(AbstractFileSystem):
|
19 |
+
"""Allow reading and writing to Windows and Samba network shares.
|
20 |
+
|
21 |
+
When using `fsspec.open()` for getting a file-like object the URI
|
22 |
+
should be specified as this format:
|
23 |
+
``smb://workgroup;user:password@server:port/share/folder/file.csv``.
|
24 |
+
|
25 |
+
Example::
|
26 |
+
|
27 |
+
>>> import fsspec
|
28 |
+
>>> with fsspec.open(
|
29 |
+
... 'smb://myuser:[email protected]/' 'share/folder/file.csv'
|
30 |
+
... ) as smbfile:
|
31 |
+
... df = pd.read_csv(smbfile, sep='|', header=None)
|
32 |
+
|
33 |
+
Note that you need to pass in a valid hostname or IP address for the host
|
34 |
+
component of the URL. Do not use the Windows/NetBIOS machine name for the
|
35 |
+
host component.
|
36 |
+
|
37 |
+
The first component of the path in the URL points to the name of the shared
|
38 |
+
folder. Subsequent path components will point to the directory/folder/file.
|
39 |
+
|
40 |
+
The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
|
41 |
+
optional.
|
42 |
+
|
43 |
+
.. note::
|
44 |
+
|
45 |
+
For working this source require `smbprotocol`_ to be installed, e.g.::
|
46 |
+
|
47 |
+
$ pip install smbprotocol
|
48 |
+
# or
|
49 |
+
# pip install smbprotocol[kerberos]
|
50 |
+
|
51 |
+
.. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
|
52 |
+
|
53 |
+
Note: if using this with the ``open`` or ``open_files``, with full URLs,
|
54 |
+
there is no way to tell if a path is relative, so all paths are assumed
|
55 |
+
to be absolute.
|
56 |
+
"""
|
57 |
+
|
58 |
+
protocol = "smb"
|
59 |
+
|
60 |
+
# pylint: disable=too-many-arguments
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
host,
|
64 |
+
port=None,
|
65 |
+
username=None,
|
66 |
+
password=None,
|
67 |
+
timeout=60,
|
68 |
+
encrypt=None,
|
69 |
+
share_access=None,
|
70 |
+
**kwargs,
|
71 |
+
):
|
72 |
+
"""
|
73 |
+
You can use _get_kwargs_from_urls to get some kwargs from
|
74 |
+
a reasonable SMB url.
|
75 |
+
|
76 |
+
Authentication will be anonymous or integrated if username/password are not
|
77 |
+
given.
|
78 |
+
|
79 |
+
Parameters
|
80 |
+
----------
|
81 |
+
host: str
|
82 |
+
The remote server name/ip to connect to
|
83 |
+
port: int or None
|
84 |
+
Port to connect with. Usually 445, sometimes 139.
|
85 |
+
username: str or None
|
86 |
+
Username to connect with. Required if Kerberos auth is not being used.
|
87 |
+
password: str or None
|
88 |
+
User's password on the server, if using username
|
89 |
+
timeout: int
|
90 |
+
Connection timeout in seconds
|
91 |
+
encrypt: bool
|
92 |
+
Whether to force encryption or not, once this has been set to True
|
93 |
+
the session cannot be changed back to False.
|
94 |
+
share_access: str or None
|
95 |
+
Specifies the default access applied to file open operations
|
96 |
+
performed with this file system object.
|
97 |
+
This affects whether other processes can concurrently open a handle
|
98 |
+
to the same file.
|
99 |
+
|
100 |
+
- None (the default): exclusively locks the file until closed.
|
101 |
+
- 'r': Allow other handles to be opened with read access.
|
102 |
+
- 'w': Allow other handles to be opened with write access.
|
103 |
+
- 'd': Allow other handles to be opened with delete access.
|
104 |
+
"""
|
105 |
+
super().__init__(**kwargs)
|
106 |
+
self.host = host
|
107 |
+
self.port = port
|
108 |
+
self.username = username
|
109 |
+
self.password = password
|
110 |
+
self.timeout = timeout
|
111 |
+
self.encrypt = encrypt
|
112 |
+
self.temppath = kwargs.pop("temppath", "")
|
113 |
+
self.share_access = share_access
|
114 |
+
self._connect()
|
115 |
+
|
116 |
+
@property
|
117 |
+
def _port(self):
|
118 |
+
return 445 if self.port is None else self.port
|
119 |
+
|
120 |
+
def _connect(self):
|
121 |
+
smbclient.register_session(
|
122 |
+
self.host,
|
123 |
+
username=self.username,
|
124 |
+
password=self.password,
|
125 |
+
port=self._port,
|
126 |
+
encrypt=self.encrypt,
|
127 |
+
connection_timeout=self.timeout,
|
128 |
+
)
|
129 |
+
|
130 |
+
@classmethod
|
131 |
+
def _strip_protocol(cls, path):
|
132 |
+
return infer_storage_options(path)["path"]
|
133 |
+
|
134 |
+
@staticmethod
|
135 |
+
def _get_kwargs_from_urls(path):
|
136 |
+
# smb://workgroup;user:password@host:port/share/folder/file.csv
|
137 |
+
out = infer_storage_options(path)
|
138 |
+
out.pop("path", None)
|
139 |
+
out.pop("protocol", None)
|
140 |
+
return out
|
141 |
+
|
142 |
+
def mkdir(self, path, create_parents=True, **kwargs):
|
143 |
+
wpath = _as_unc_path(self.host, path)
|
144 |
+
if create_parents:
|
145 |
+
smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
|
146 |
+
else:
|
147 |
+
smbclient.mkdir(wpath, port=self._port, **kwargs)
|
148 |
+
|
149 |
+
def makedirs(self, path, exist_ok=False):
|
150 |
+
if _share_has_path(path):
|
151 |
+
wpath = _as_unc_path(self.host, path)
|
152 |
+
smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
|
153 |
+
|
154 |
+
def rmdir(self, path):
|
155 |
+
if _share_has_path(path):
|
156 |
+
wpath = _as_unc_path(self.host, path)
|
157 |
+
smbclient.rmdir(wpath, port=self._port)
|
158 |
+
|
159 |
+
def info(self, path, **kwargs):
|
160 |
+
wpath = _as_unc_path(self.host, path)
|
161 |
+
stats = smbclient.stat(wpath, port=self._port, **kwargs)
|
162 |
+
if S_ISDIR(stats.st_mode):
|
163 |
+
stype = "directory"
|
164 |
+
elif S_ISLNK(stats.st_mode):
|
165 |
+
stype = "link"
|
166 |
+
else:
|
167 |
+
stype = "file"
|
168 |
+
res = {
|
169 |
+
"name": path + "/" if stype == "directory" else path,
|
170 |
+
"size": stats.st_size,
|
171 |
+
"type": stype,
|
172 |
+
"uid": stats.st_uid,
|
173 |
+
"gid": stats.st_gid,
|
174 |
+
"time": stats.st_atime,
|
175 |
+
"mtime": stats.st_mtime,
|
176 |
+
}
|
177 |
+
return res
|
178 |
+
|
179 |
+
def created(self, path):
|
180 |
+
"""Return the created timestamp of a file as a datetime.datetime"""
|
181 |
+
wpath = _as_unc_path(self.host, path)
|
182 |
+
stats = smbclient.stat(wpath, port=self._port)
|
183 |
+
return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
|
184 |
+
|
185 |
+
def modified(self, path):
|
186 |
+
"""Return the modified timestamp of a file as a datetime.datetime"""
|
187 |
+
wpath = _as_unc_path(self.host, path)
|
188 |
+
stats = smbclient.stat(wpath, port=self._port)
|
189 |
+
return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
|
190 |
+
|
191 |
+
def ls(self, path, detail=True, **kwargs):
|
192 |
+
unc = _as_unc_path(self.host, path)
|
193 |
+
listed = smbclient.listdir(unc, port=self._port, **kwargs)
|
194 |
+
dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
|
195 |
+
if detail:
|
196 |
+
dirs = [self.info(d) for d in dirs]
|
197 |
+
return dirs
|
198 |
+
|
199 |
+
# pylint: disable=too-many-arguments
|
200 |
+
def _open(
|
201 |
+
self,
|
202 |
+
path,
|
203 |
+
mode="rb",
|
204 |
+
block_size=-1,
|
205 |
+
autocommit=True,
|
206 |
+
cache_options=None,
|
207 |
+
**kwargs,
|
208 |
+
):
|
209 |
+
"""
|
210 |
+
block_size: int or None
|
211 |
+
If 0, no buffering, 1, line buffering, >1, buffer that many bytes
|
212 |
+
|
213 |
+
Notes
|
214 |
+
-----
|
215 |
+
By specifying 'share_access' in 'kwargs' it is possible to override the
|
216 |
+
default shared access setting applied in the constructor of this object.
|
217 |
+
"""
|
218 |
+
bls = block_size if block_size is not None and block_size >= 0 else -1
|
219 |
+
wpath = _as_unc_path(self.host, path)
|
220 |
+
share_access = kwargs.pop("share_access", self.share_access)
|
221 |
+
if "w" in mode and autocommit is False:
|
222 |
+
temp = _as_temp_path(self.host, path, self.temppath)
|
223 |
+
return SMBFileOpener(
|
224 |
+
wpath, temp, mode, port=self._port, block_size=bls, **kwargs
|
225 |
+
)
|
226 |
+
return smbclient.open_file(
|
227 |
+
wpath,
|
228 |
+
mode,
|
229 |
+
buffering=bls,
|
230 |
+
share_access=share_access,
|
231 |
+
port=self._port,
|
232 |
+
**kwargs,
|
233 |
+
)
|
234 |
+
|
235 |
+
def copy(self, path1, path2, **kwargs):
|
236 |
+
"""Copy within two locations in the same filesystem"""
|
237 |
+
wpath1 = _as_unc_path(self.host, path1)
|
238 |
+
wpath2 = _as_unc_path(self.host, path2)
|
239 |
+
smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
|
240 |
+
|
241 |
+
def _rm(self, path):
|
242 |
+
if _share_has_path(path):
|
243 |
+
wpath = _as_unc_path(self.host, path)
|
244 |
+
stats = smbclient.stat(wpath, port=self._port)
|
245 |
+
if S_ISDIR(stats.st_mode):
|
246 |
+
smbclient.rmdir(wpath, port=self._port)
|
247 |
+
else:
|
248 |
+
smbclient.remove(wpath, port=self._port)
|
249 |
+
|
250 |
+
def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
|
251 |
+
wpath1 = _as_unc_path(self.host, path1)
|
252 |
+
wpath2 = _as_unc_path(self.host, path2)
|
253 |
+
smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
|
254 |
+
|
255 |
+
|
256 |
+
def _as_unc_path(host, path):
|
257 |
+
rpath = path.replace("/", "\\")
|
258 |
+
unc = f"\\\\{host}{rpath}"
|
259 |
+
return unc
|
260 |
+
|
261 |
+
|
262 |
+
def _as_temp_path(host, path, temppath):
|
263 |
+
share = path.split("/")[1]
|
264 |
+
temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
|
265 |
+
unc = _as_unc_path(host, temp_file)
|
266 |
+
return unc
|
267 |
+
|
268 |
+
|
269 |
+
def _share_has_path(path):
|
270 |
+
parts = path.count("/")
|
271 |
+
if path.endswith("/"):
|
272 |
+
return parts > 2
|
273 |
+
return parts > 1
|
274 |
+
|
275 |
+
|
276 |
+
class SMBFileOpener:
|
277 |
+
"""writes to remote temporary file, move on commit"""
|
278 |
+
|
279 |
+
def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
|
280 |
+
self.path = path
|
281 |
+
self.temp = temp
|
282 |
+
self.mode = mode
|
283 |
+
self.block_size = block_size
|
284 |
+
self.kwargs = kwargs
|
285 |
+
self.smbfile = None
|
286 |
+
self._incontext = False
|
287 |
+
self.port = port
|
288 |
+
self._open()
|
289 |
+
|
290 |
+
def _open(self):
|
291 |
+
if self.smbfile is None or self.smbfile.closed:
|
292 |
+
self.smbfile = smbclient.open_file(
|
293 |
+
self.temp,
|
294 |
+
self.mode,
|
295 |
+
port=self.port,
|
296 |
+
buffering=self.block_size,
|
297 |
+
**self.kwargs,
|
298 |
+
)
|
299 |
+
|
300 |
+
def commit(self):
|
301 |
+
"""Move temp file to definitive on success."""
|
302 |
+
# TODO: use transaction support in SMB protocol
|
303 |
+
smbclient.replace(self.temp, self.path, port=self.port)
|
304 |
+
|
305 |
+
def discard(self):
|
306 |
+
"""Remove the temp file on failure."""
|
307 |
+
smbclient.remove(self.temp, port=self.port)
|
308 |
+
|
309 |
+
def __fspath__(self):
|
310 |
+
return self.path
|
311 |
+
|
312 |
+
def __iter__(self):
|
313 |
+
return self.smbfile.__iter__()
|
314 |
+
|
315 |
+
def __getattr__(self, item):
|
316 |
+
return getattr(self.smbfile, item)
|
317 |
+
|
318 |
+
def __enter__(self):
|
319 |
+
self._incontext = True
|
320 |
+
return self.smbfile.__enter__()
|
321 |
+
|
322 |
+
def __exit__(self, exc_type, exc_value, traceback):
|
323 |
+
self._incontext = False
|
324 |
+
self.smbfile.__exit__(exc_type, exc_value, traceback)
|
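A minimal sketch of using the SMBFileSystem above directly rather than through a full ``smb://`` URL; the server address, credentials and share layout are placeholders.

import fsspec

fs = fsspec.filesystem("smb", host="10.1.0.2", username="myuser", password="pass")
print(fs.ls("/share/folder"))                  # first path component is the share name
with fs.open("/share/folder/file.csv", "rb") as f:
    data = f.read()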
lib/python3.11/site-packages/fsspec/implementations/tar.py
ADDED
@@ -0,0 +1,124 @@
+import logging
+import tarfile
+
+import fsspec
+from fsspec.archive import AbstractArchiveFileSystem
+from fsspec.compression import compr
+from fsspec.utils import infer_compression
+
+typemap = {b"0": "file", b"5": "directory"}
+
+logger = logging.getLogger("tar")
+
+
+class TarFileSystem(AbstractArchiveFileSystem):
+    """Compressed Tar archives as a file-system (read-only)
+
+    Supports the following formats:
+    tar.gz, tar.bz2, tar.xz
+    """
+
+    root_marker = ""
+    protocol = "tar"
+    cachable = False
+
+    def __init__(
+        self,
+        fo="",
+        index_store=None,
+        target_options=None,
+        target_protocol=None,
+        compression=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        target_options = target_options or {}
+
+        if isinstance(fo, str):
+            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
+            fo = self.of.open()  # keep the reference
+
+        # Try to infer compression.
+        if compression is None:
+            name = None
+
+            # Try different ways to get hold of the filename. `fo` might either
+            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
+            # `fsspec.AbstractFileSystem` instance.
+            try:
+                # Amended io.BufferedReader or similar.
+                # This uses a "protocol extension" where original filenames are
+                # propagated to archive-like filesystems in order to let them
+                # infer the right compression appropriately.
+                if hasattr(fo, "original"):
+                    name = fo.original
+
+                # fsspec.LocalFileOpener
+                elif hasattr(fo, "path"):
+                    name = fo.path
+
+                # io.BufferedReader
+                elif hasattr(fo, "name"):
+                    name = fo.name
+
+                # fsspec.AbstractFileSystem
+                elif hasattr(fo, "info"):
+                    name = fo.info()["name"]
+
+            except Exception as ex:
+                logger.warning(
+                    f"Unable to determine file name, not inferring compression: {ex}"
+                )
+
+            if name is not None:
+                compression = infer_compression(name)
+                logger.info(f"Inferred compression {compression} from file name {name}")
+
+        if compression is not None:
+            # TODO: tarfile already implements compression with modes like "'r:gz'",
+            # but then would seek to offset in the file work?
+            fo = compr[compression](fo)
+
+        self._fo_ref = fo
+        self.fo = fo  # the whole instance is a context
+        self.tar = tarfile.TarFile(fileobj=self.fo)
+        self.dir_cache = None
+
+        self.index_store = index_store
+        self.index = None
+        self._index()
+
+    def _index(self):
+        # TODO: load and set saved index, if exists
+        out = {}
+        for ti in self.tar:
+            info = ti.get_info()
+            info["type"] = typemap.get(info["type"], "file")
+            name = ti.get_info()["name"].rstrip("/")
+            out[name] = (info, ti.offset_data)
+
+        self.index = out
+        # TODO: save index to self.index_store here, if set
+
+    def _get_dirs(self):
+        if self.dir_cache is not None:
+            return
+
+        # This enables ls to get directories as children as well as files
+        self.dir_cache = {
+            dirname: {"name": dirname, "size": 0, "type": "directory"}
+            for dirname in self._all_dirnames(self.tar.getnames())
+        }
+        for member in self.tar.getmembers():
+            info = member.get_info()
+            info["name"] = info["name"].rstrip("/")
+            info["type"] = typemap.get(info["type"], "file")
+            self.dir_cache[info["name"]] = info
+
+    def _open(self, path, mode="rb", **kwargs):
+        if mode != "rb":
+            raise ValueError("Read-only filesystem implementation")
+        details, offset = self.index[path]
+        if details["type"] != "file":
+            raise ValueError("Can only handle regular files")
+        return self.tar.extractfile(path)
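A small, self-contained sketch of reading through the TarFileSystem above; the archive and its file names are invented so the example stands alone, and compression is inferred from the ``.tar.gz`` name exactly as in the constructor logic.

import io
import tarfile

import fsspec

payload = b"hello tar"
with tarfile.open("example.tar.gz", "w:gz") as tf:
    ti = tarfile.TarInfo(name="data/file.txt")
    ti.size = len(payload)
    tf.addfile(ti, io.BytesIO(payload))

fs = fsspec.filesystem("tar", fo="example.tar.gz")  # compression inferred from the name
print(fs.ls("data"))             # archive entries under the "data" directory
print(fs.cat("data/file.txt"))   # b"hello tar"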
lib/python3.11/site-packages/fsspec/implementations/webhdfs.py
ADDED
@@ -0,0 +1,467 @@
1 |
+
# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
|
2 |
+
|
3 |
+
import logging
|
4 |
+
import os
|
5 |
+
import secrets
|
6 |
+
import shutil
|
7 |
+
import tempfile
|
8 |
+
import uuid
|
9 |
+
from contextlib import suppress
|
10 |
+
from urllib.parse import quote
|
11 |
+
|
12 |
+
import requests
|
13 |
+
|
14 |
+
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
15 |
+
from ..utils import infer_storage_options, tokenize
|
16 |
+
|
17 |
+
logger = logging.getLogger("webhdfs")
|
18 |
+
|
19 |
+
|
20 |
+
class WebHDFS(AbstractFileSystem):
|
21 |
+
"""
|
22 |
+
Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
|
23 |
+
|
24 |
+
Four auth mechanisms are supported:
|
25 |
+
|
26 |
+
insecure: no auth is done, and the user is assumed to be whoever they
|
27 |
+
say they are (parameter ``user``), or a predefined value such as
|
28 |
+
"dr.who" if not given
|
29 |
+
spnego: when kerberos authentication is enabled, auth is negotiated by
|
30 |
+
requests_kerberos https://github.com/requests/requests-kerberos .
|
31 |
+
This establishes a session based on existing kinit login and/or
|
32 |
+
specified principal/password; parameters are passed with ``kerb_kwargs``
|
33 |
+
token: uses an existing Hadoop delegation token from another secured
|
34 |
+
service. Indeed, this client can also generate such tokens when
|
35 |
+
not insecure. Note that tokens expire, but can be renewed (by a
|
36 |
+
previously specified user) and may allow for proxying.
|
37 |
+
basic-auth: used when both parameter ``user`` and parameter ``password``
|
38 |
+
are provided.
|
39 |
+
|
40 |
+
"""
|
41 |
+
|
42 |
+
tempdir = str(tempfile.gettempdir())
|
43 |
+
protocol = "webhdfs", "webHDFS"
|
44 |
+
|
45 |
+
def __init__(
|
46 |
+
self,
|
47 |
+
host,
|
48 |
+
port=50070,
|
49 |
+
kerberos=False,
|
50 |
+
token=None,
|
51 |
+
user=None,
|
52 |
+
password=None,
|
53 |
+
proxy_to=None,
|
54 |
+
kerb_kwargs=None,
|
55 |
+
data_proxy=None,
|
56 |
+
use_https=False,
|
57 |
+
**kwargs,
|
58 |
+
):
|
59 |
+
"""
|
60 |
+
Parameters
|
61 |
+
----------
|
62 |
+
host: str
|
63 |
+
Name-node address
|
64 |
+
port: int
|
65 |
+
Port for webHDFS
|
66 |
+
kerberos: bool
|
67 |
+
Whether to authenticate with kerberos for this connection
|
68 |
+
token: str or None
|
69 |
+
If given, use this token on every call to authenticate. A user
|
70 |
+
and user-proxy may be encoded in the token and should not be also
|
71 |
+
given
|
72 |
+
user: str or None
|
73 |
+
If given, assert the user name to connect with
|
74 |
+
password: str or None
|
75 |
+
If given, assert the password to use for basic auth. If password
|
76 |
+
is provided, user must be provided also
|
77 |
+
proxy_to: str or None
|
78 |
+
If given, the user has the authority to proxy, and this value is
|
79 |
+
the user in who's name actions are taken
|
80 |
+
kerb_kwargs: dict
|
81 |
+
Any extra arguments for HTTPKerberosAuth, see
|
82 |
+
`<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
|
83 |
+
data_proxy: dict, callable or None
|
84 |
+
If given, map data-node addresses. This can be necessary if the
|
85 |
+
HDFS cluster is behind a proxy, running on Docker or otherwise has
|
86 |
+
a mismatch between the host-names given by the name-node and the
|
87 |
+
address by which to refer to them from the client. If a dict,
|
88 |
+
maps host names ``host->data_proxy[host]``; if a callable, full
|
89 |
+
URLs are passed, and function must conform to
|
90 |
+
``url->data_proxy(url)``.
|
91 |
+
use_https: bool
|
92 |
+
Whether to connect to the Name-node using HTTPS instead of HTTP
|
93 |
+
kwargs
|
94 |
+
"""
|
95 |
+
if self._cached:
|
96 |
+
return
|
97 |
+
super().__init__(**kwargs)
|
98 |
+
self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
|
99 |
+
self.kerb = kerberos
|
100 |
+
self.kerb_kwargs = kerb_kwargs or {}
|
101 |
+
self.pars = {}
|
102 |
+
self.proxy = data_proxy or {}
|
103 |
+
if token is not None:
|
104 |
+
if user is not None or proxy_to is not None:
|
105 |
+
raise ValueError(
|
106 |
+
"If passing a delegation token, must not set "
|
107 |
+
"user or proxy_to, as these are encoded in the"
|
108 |
+
" token"
|
109 |
+
)
|
110 |
+
self.pars["delegation"] = token
|
111 |
+
self.user = user
|
112 |
+
self.password = password
|
113 |
+
|
114 |
+
if password is not None:
|
115 |
+
if user is None:
|
116 |
+
raise ValueError(
|
117 |
+
"If passing a password, the user must also be"
|
118 |
+
"set in order to set up the basic-auth"
|
119 |
+
)
|
120 |
+
else:
|
121 |
+
if user is not None:
|
122 |
+
self.pars["user.name"] = user
|
123 |
+
|
124 |
+
if proxy_to is not None:
|
125 |
+
self.pars["doas"] = proxy_to
|
126 |
+
if kerberos and user is not None:
|
127 |
+
raise ValueError(
|
128 |
+
"If using Kerberos auth, do not specify the "
|
129 |
+
"user, this is handled by kinit."
|
130 |
+
)
|
131 |
+
self._connect()
|
132 |
+
|
133 |
+
self._fsid = f"webhdfs_{tokenize(host, port)}"
|
134 |
+
|
135 |
+
@property
|
136 |
+
def fsid(self):
|
137 |
+
return self._fsid
|
138 |
+
|
139 |
+
def _connect(self):
|
140 |
+
self.session = requests.Session()
|
141 |
+
if self.kerb:
|
142 |
+
from requests_kerberos import HTTPKerberosAuth
|
143 |
+
|
144 |
+
self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
|
145 |
+
|
146 |
+
if self.user is not None and self.password is not None:
|
147 |
+
from requests.auth import HTTPBasicAuth
|
148 |
+
|
149 |
+
self.session.auth = HTTPBasicAuth(self.user, self.password)
|
150 |
+
|
151 |
+
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
|
152 |
+
url = self._apply_proxy(self.url + quote(path or "", safe="/="))
|
153 |
+
args = kwargs.copy()
|
154 |
+
args.update(self.pars)
|
155 |
+
args["op"] = op.upper()
|
156 |
+
logger.debug("sending %s with %s", url, method)
|
157 |
+
out = self.session.request(
|
158 |
+
method=method.upper(),
|
159 |
+
url=url,
|
160 |
+
params=args,
|
161 |
+
data=data,
|
162 |
+
allow_redirects=redirect,
|
163 |
+
)
|
164 |
+
if out.status_code in [400, 401, 403, 404, 500]:
|
165 |
+
try:
|
166 |
+
err = out.json()
|
167 |
+
msg = err["RemoteException"]["message"]
|
168 |
+
exp = err["RemoteException"]["exception"]
|
169 |
+
except (ValueError, KeyError):
|
170 |
+
pass
|
171 |
+
else:
|
172 |
+
if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
|
173 |
+
raise ValueError(msg)
|
174 |
+
elif exp in ["SecurityException", "AccessControlException"]:
|
175 |
+
raise PermissionError(msg)
|
176 |
+
elif exp in ["FileNotFoundException"]:
|
177 |
+
raise FileNotFoundError(msg)
|
178 |
+
else:
|
179 |
+
raise RuntimeError(msg)
|
180 |
+
out.raise_for_status()
|
181 |
+
return out
|
182 |
+
|
183 |
+
def _open(
|
184 |
+
self,
|
185 |
+
path,
|
186 |
+
mode="rb",
|
187 |
+
block_size=None,
|
188 |
+
autocommit=True,
|
189 |
+
replication=None,
|
190 |
+
permissions=None,
|
191 |
+
**kwargs,
|
192 |
+
):
|
193 |
+
"""
|
194 |
+
|
195 |
+
Parameters
|
196 |
+
----------
|
197 |
+
path: str
|
198 |
+
File location
|
199 |
+
mode: str
|
200 |
+
'rb', 'wb', etc.
|
201 |
+
block_size: int
|
202 |
+
Client buffer size for read-ahead or write buffer
|
203 |
+
autocommit: bool
|
204 |
+
If False, writes to temporary file that only gets put in final
|
205 |
+
location upon commit
|
206 |
+
replication: int
|
207 |
+
Number of copies of file on the cluster, write mode only
|
208 |
+
permissions: str or int
|
209 |
+
posix permissions, write mode only
|
210 |
+
kwargs
|
211 |
+
|
212 |
+
Returns
|
213 |
+
-------
|
214 |
+
WebHDFile instance
|
215 |
+
"""
|
216 |
+
block_size = block_size or self.blocksize
|
217 |
+
return WebHDFile(
|
218 |
+
self,
|
219 |
+
path,
|
220 |
+
mode=mode,
|
221 |
+
block_size=block_size,
|
222 |
+
tempdir=self.tempdir,
|
223 |
+
autocommit=autocommit,
|
224 |
+
replication=replication,
|
225 |
+
permissions=permissions,
|
226 |
+
)
|
227 |
+
|
228 |
+
@staticmethod
|
229 |
+
def _process_info(info):
|
230 |
+
info["type"] = info["type"].lower()
|
231 |
+
info["size"] = info["length"]
|
232 |
+
return info
|
233 |
+
|
234 |
+
@classmethod
|
235 |
+
def _strip_protocol(cls, path):
|
236 |
+
return infer_storage_options(path)["path"]
|
237 |
+
|
238 |
+
@staticmethod
|
239 |
+
def _get_kwargs_from_urls(urlpath):
|
240 |
+
out = infer_storage_options(urlpath)
|
241 |
+
out.pop("path", None)
|
242 |
+
out.pop("protocol", None)
|
243 |
+
if "username" in out:
|
244 |
+
out["user"] = out.pop("username")
|
245 |
+
return out
|
246 |
+
|
247 |
+
def info(self, path):
|
248 |
+
out = self._call("GETFILESTATUS", path=path)
|
249 |
+
info = out.json()["FileStatus"]
|
250 |
+
info["name"] = path
|
251 |
+
return self._process_info(info)
|
252 |
+
|
253 |
+
def ls(self, path, detail=False):
|
254 |
+
out = self._call("LISTSTATUS", path=path)
|
255 |
+
infos = out.json()["FileStatuses"]["FileStatus"]
|
256 |
+
for info in infos:
|
257 |
+
self._process_info(info)
|
258 |
+
info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
|
259 |
+
if detail:
|
260 |
+
return sorted(infos, key=lambda i: i["name"])
|
261 |
+
else:
|
262 |
+
return sorted(info["name"] for info in infos)
|
263 |
+
|
264 |
+
def content_summary(self, path):
|
265 |
+
"""Total numbers of files, directories and bytes under path"""
|
266 |
+
out = self._call("GETCONTENTSUMMARY", path=path)
|
267 |
+
return out.json()["ContentSummary"]
|
268 |
+
|
269 |
+
def ukey(self, path):
|
270 |
+
"""Checksum info of file, giving method and result"""
|
271 |
+
out = self._call("GETFILECHECKSUM", path=path, redirect=False)
|
272 |
+
if "Location" in out.headers:
|
273 |
+
location = self._apply_proxy(out.headers["Location"])
|
274 |
+
out2 = self.session.get(location)
|
275 |
+
out2.raise_for_status()
|
276 |
+
return out2.json()["FileChecksum"]
|
277 |
+
else:
|
278 |
+
out.raise_for_status()
|
279 |
+
return out.json()["FileChecksum"]
|
280 |
+
|
281 |
+
def home_directory(self):
|
282 |
+
"""Get user's home directory"""
|
283 |
+
out = self._call("GETHOMEDIRECTORY")
|
284 |
+
return out.json()["Path"]
|
285 |
+
|
286 |
+
def get_delegation_token(self, renewer=None):
|
287 |
+
"""Retrieve token which can give the same authority to other uses
|
288 |
+
|
289 |
+
Parameters
|
290 |
+
----------
|
291 |
+
renewer: str or None
|
292 |
+
User who may use this token; if None, will be current user
|
293 |
+
"""
|
294 |
+
if renewer:
|
295 |
+
out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
|
296 |
+
else:
|
297 |
+
out = self._call("GETDELEGATIONTOKEN")
|
298 |
+
t = out.json()["Token"]
|
299 |
+
if t is None:
|
300 |
+
raise ValueError("No token available for this user/security context")
|
301 |
+
return t["urlString"]
|
302 |
+
|
303 |
+
def renew_delegation_token(self, token):
|
304 |
+
"""Make token live longer. Returns new expiry time"""
|
305 |
+
out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
|
306 |
+
return out.json()["long"]
|
307 |
+
|
308 |
+
def cancel_delegation_token(self, token):
|
309 |
+
"""Stop the token from being useful"""
|
310 |
+
self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
|
311 |
+
|
312 |
+
def chmod(self, path, mod):
|
313 |
+
"""Set the permission at path
|
314 |
+
|
315 |
+
Parameters
|
316 |
+
----------
|
317 |
+
path: str
|
318 |
+
location to set (file or directory)
|
319 |
+
mod: str or int
|
320 |
+
posix epresentation or permission, give as oct string, e.g, '777'
|
321 |
+
or 0o777
|
322 |
+
"""
|
323 |
+
self._call("SETPERMISSION", method="put", path=path, permission=mod)
|
324 |
+
|
325 |
+
def chown(self, path, owner=None, group=None):
|
326 |
+
"""Change owning user and/or group"""
|
327 |
+
kwargs = {}
|
328 |
+
if owner is not None:
|
329 |
+
kwargs["owner"] = owner
|
330 |
+
if group is not None:
|
331 |
+
kwargs["group"] = group
|
332 |
+
self._call("SETOWNER", method="put", path=path, **kwargs)
|
333 |
+
|
334 |
+
def set_replication(self, path, replication):
|
335 |
+
"""
|
336 |
+
Set file replication factor
|
337 |
+
|
338 |
+
Parameters
|
339 |
+
----------
|
340 |
+
path: str
|
341 |
+
File location (not for directories)
|
342 |
+
replication: int
|
343 |
+
Number of copies of file on the cluster. Should be smaller than
|
344 |
+
number of data nodes; normally 3 on most systems.
|
345 |
+
"""
|
346 |
+
self._call("SETREPLICATION", path=path, method="put", replication=replication)
|
347 |
+
|
348 |
+
def mkdir(self, path, **kwargs):
|
349 |
+
self._call("MKDIRS", method="put", path=path)
|
350 |
+
|
351 |
+
def makedirs(self, path, exist_ok=False):
|
352 |
+
if exist_ok is False and self.exists(path):
|
353 |
+
raise FileExistsError(path)
|
354 |
+
self.mkdir(path)
|
355 |
+
|
356 |
+
def mv(self, path1, path2, **kwargs):
|
357 |
+
self._call("RENAME", method="put", path=path1, destination=path2)
|
358 |
+
|
359 |
+
def rm(self, path, recursive=False, **kwargs):
|
360 |
+
self._call(
|
361 |
+
"DELETE",
|
362 |
+
method="delete",
|
363 |
+
path=path,
|
364 |
+
recursive="true" if recursive else "false",
|
365 |
+
)
|
366 |
+
|
367 |
+
def rm_file(self, path, **kwargs):
|
368 |
+
self.rm(path)
|
369 |
+
|
370 |
+
def cp_file(self, lpath, rpath, **kwargs):
|
371 |
+
with self.open(lpath) as lstream:
|
372 |
+
tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
|
373 |
+
# Perform an atomic copy (stream to a temporary file and
|
374 |
+
# move it to the actual destination).
|
375 |
+
try:
|
376 |
+
with self.open(tmp_fname, "wb") as rstream:
|
377 |
+
shutil.copyfileobj(lstream, rstream)
|
378 |
+
self.mv(tmp_fname, rpath)
|
379 |
+
except BaseException: # noqa
|
380 |
+
with suppress(FileNotFoundError):
|
381 |
+
self.rm(tmp_fname)
|
382 |
+
raise
|
383 |
+
|
384 |
+
def _apply_proxy(self, location):
|
385 |
+
if self.proxy and callable(self.proxy):
|
386 |
+
location = self.proxy(location)
|
387 |
+
elif self.proxy:
|
388 |
+
# as a dict
|
389 |
+
for k, v in self.proxy.items():
|
390 |
+
location = location.replace(k, v, 1)
|
391 |
+
return location
|
392 |
+
|
393 |
+
|
394 |
+
class WebHDFile(AbstractBufferedFile):
|
395 |
+
"""A file living in HDFS over webHDFS"""
|
396 |
+
|
397 |
+
def __init__(self, fs, path, **kwargs):
|
398 |
+
super().__init__(fs, path, **kwargs)
|
399 |
+
kwargs = kwargs.copy()
|
400 |
+
if kwargs.get("permissions", None) is None:
|
401 |
+
kwargs.pop("permissions", None)
|
402 |
+
if kwargs.get("replication", None) is None:
|
403 |
+
kwargs.pop("replication", None)
|
404 |
+
self.permissions = kwargs.pop("permissions", 511)
|
405 |
+
tempdir = kwargs.pop("tempdir")
|
406 |
+
if kwargs.pop("autocommit", False) is False:
|
407 |
+
self.target = self.path
|
408 |
+
self.path = os.path.join(tempdir, str(uuid.uuid4()))
|
409 |
+
|
410 |
+
def _upload_chunk(self, final=False):
|
411 |
+
"""Write one part of a multi-block file upload
|
412 |
+
|
413 |
+
Parameters
|
414 |
+
==========
|
415 |
+
final: bool
|
416 |
+
This is the last block, so should complete file, if
|
417 |
+
self.autocommit is True.
|
418 |
+
"""
|
419 |
+
out = self.fs.session.post(
|
420 |
+
self.location,
|
421 |
+
data=self.buffer.getvalue(),
|
422 |
+
headers={"content-type": "application/octet-stream"},
|
423 |
+
)
|
424 |
+
out.raise_for_status()
|
425 |
+
return True
|
426 |
+
|
427 |
+
def _initiate_upload(self):
|
428 |
+
"""Create remote file/upload"""
|
429 |
+
kwargs = self.kwargs.copy()
|
430 |
+
if "a" in self.mode:
|
431 |
+
op, method = "APPEND", "POST"
|
432 |
+
else:
|
433 |
+
op, method = "CREATE", "PUT"
|
434 |
+
kwargs["overwrite"] = "true"
|
435 |
+
out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
|
436 |
+
location = self.fs._apply_proxy(out.headers["Location"])
|
437 |
+
if "w" in self.mode:
|
438 |
+
# create empty file to append to
|
439 |
+
out2 = self.fs.session.put(
|
440 |
+
location, headers={"content-type": "application/octet-stream"}
|
441 |
+
)
|
442 |
+
out2.raise_for_status()
|
443 |
+
# after creating empty file, change location to append to
|
444 |
+
out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
|
445 |
+
self.location = self.fs._apply_proxy(out2.headers["Location"])
|
446 |
+
|
447 |
+
def _fetch_range(self, start, end):
|
448 |
+
start = max(start, 0)
|
449 |
+
end = min(self.size, end)
|
450 |
+
if start >= end or start >= self.size:
|
451 |
+
return b""
|
452 |
+
out = self.fs._call(
|
453 |
+
"OPEN", path=self.path, offset=start, length=end - start, redirect=False
|
454 |
+
)
|
455 |
+
out.raise_for_status()
|
456 |
+
if "Location" in out.headers:
|
457 |
+
location = out.headers["Location"]
|
458 |
+
out2 = self.fs.session.get(self.fs._apply_proxy(location))
|
459 |
+
return out2.content
|
460 |
+
else:
|
461 |
+
return out.content
|
462 |
+
|
463 |
+
def commit(self):
|
464 |
+
self.fs.mv(self.path, self.target)
|
465 |
+
|
466 |
+
def discard(self):
|
467 |
+
self.fs.rm(self.path)
|
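A minimal sketch of talking to a cluster through the WebHDFS class above; the name-node address, port, user and paths are placeholders for a reachable cluster using insecure (user-name) auth.

import fsspec

fs = fsspec.filesystem("webhdfs", host="namenode.example.com", port=9870, user="hadoop")
print(fs.ls("/user/hadoop"))
with fs.open("/user/hadoop/output/part-00000", "rb") as f:
    first_kb = f.read(1024)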
lib/python3.11/site-packages/fsspec/implementations/zip.py
ADDED
@@ -0,0 +1,133 @@
import zipfile

import fsspec
from fsspec.archive import AbstractArchiveFileSystem


class ZipFileSystem(AbstractArchiveFileSystem):
    """Read/Write contents of ZIP archive as a file-system

    Keeps file object open while instance lives.

    This class is pickleable, but not necessarily thread-safe
    """

    root_marker = ""
    protocol = "zip"
    cachable = False

    def __init__(
        self,
        fo="",
        mode="r",
        target_protocol=None,
        target_options=None,
        compression=zipfile.ZIP_STORED,
        allowZip64=True,
        compresslevel=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fo: str or file-like
            Contains ZIP, and must exist. If a str, will fetch file using
            :meth:`~fsspec.open_files`, which must return one file exactly.
        mode: str
            Accept: "r", "w", "a"
        target_protocol: str (optional)
            If ``fo`` is a string, this value can be used to override the
            FS protocol inferred from a URL
        target_options: dict (optional)
            Kwargs passed when instantiating the target FS, if ``fo`` is
            a string.
        compression, allowZip64, compresslevel: passed to ZipFile
            Only relevant when creating a ZIP
        """
        super().__init__(self, **kwargs)
        if mode not in set("rwa"):
            raise ValueError(f"mode '{mode}' not understood")
        self.mode = mode
        if isinstance(fo, str):
            if mode == "a":
                m = "r+b"
            else:
                m = mode + "b"
            fo = fsspec.open(
                fo, mode=m, protocol=target_protocol, **(target_options or {})
            )
        self.of = fo
        self.fo = fo.__enter__()  # the whole instance is a context
        self.zip = zipfile.ZipFile(
            self.fo,
            mode=mode,
            compression=compression,
            allowZip64=allowZip64,
            compresslevel=compresslevel,
        )
        self.dir_cache = None

    @classmethod
    def _strip_protocol(cls, path):
        # zip file paths are always relative to the archive root
        return super()._strip_protocol(path).lstrip("/")

    def __del__(self):
        if hasattr(self, "zip"):
            self.close()
            del self.zip

    def close(self):
        """Commits any write changes to the file. Done on ``del`` too."""
        self.zip.close()

    def _get_dirs(self):
        if self.dir_cache is None or self.mode in set("wa"):
            # when writing, dir_cache is always in the ZipFile's attributes,
            # not read from the file.
            files = self.zip.infolist()
            self.dir_cache = {
                dirname.rstrip("/"): {
                    "name": dirname.rstrip("/"),
                    "size": 0,
                    "type": "directory",
                }
                for dirname in self._all_dirnames(self.zip.namelist())
            }
            for z in files:
                f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
                f.update(
                    {
                        "name": z.filename.rstrip("/"),
                        "size": z.file_size,
                        "type": ("directory" if z.is_dir() else "file"),
                    }
                )
                self.dir_cache[f["name"]] = f

    def pipe_file(self, path, value, **kwargs):
        # override upstream, because we know the exact file size in this case
        self.zip.writestr(path, value, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        path = self._strip_protocol(path)
        if "r" in mode and self.mode in set("wa"):
            if self.exists(path):
                raise OSError("ZipFS can only be open for reading or writing, not both")
            raise FileNotFoundError(path)
        if "r" in self.mode and "w" in mode:
            raise OSError("ZipFS can only be open for reading or writing, not both")
        out = self.zip.open(path, mode.strip("b"))
        if "r" in mode:
            info = self.info(path)
            out.size = info["size"]
            out.name = info["name"]
        return out
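A brief usage sketch of the ZipFileSystem defined above; the archive names and member paths ("example.zip", "inner/file.txt", "new.zip", "hello.txt") are assumptions chosen for illustration.

import fsspec

# Reading an existing archive: list entries and read one member.
fs = fsspec.filesystem("zip", fo="example.zip")
print(fs.ls(""))                      # entries at the archive root
with fs.open("inner/file.txt", "rb") as f:
    print(f.read())

# Writing: mode="w" creates a fresh archive; pipe_file writes whole members
# (the override above uses ZipFile.writestr, so the member size is known).
zfs = fsspec.filesystem("zip", fo="new.zip", mode="w")
zfs.pipe_file("hello.txt", b"hello world")
zfs.close()                           # flushes the underlying ZipFile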
lib/python3.11/site-packages/fsspec/mapping.py
ADDED
@@ -0,0 +1,247 @@
import array
import posixpath
import warnings
from collections.abc import MutableMapping
from functools import cached_property

from .core import url_to_fs


class FSMap(MutableMapping):
    """Wrap a FileSystem instance as a mutable mapping.

    The keys of the mapping become files under the given root, and the
    values (which must be bytes) the contents of those files.

    Parameters
    ----------
    root: string
        prefix for all the files
    fs: FileSystem instance
    check: bool (=True)
        performs a touch at the location, to check for write access.

    Examples
    --------
    >>> fs = FileSystem(**parameters)  # doctest: +SKIP
    >>> d = FSMap('my-data/path/', fs)  # doctest: +SKIP
    or, more likely
    >>> d = fs.get_mapper('my-data/path/')

    >>> d['loc1'] = b'Hello World'  # doctest: +SKIP
    >>> list(d.keys())  # doctest: +SKIP
    ['loc1']
    >>> d['loc1']  # doctest: +SKIP
    b'Hello World'
    """

    def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
        self.fs = fs
        self.root = fs._strip_protocol(root).rstrip("/")
        self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
        if missing_exceptions is None:
            missing_exceptions = (
                FileNotFoundError,
                IsADirectoryError,
                NotADirectoryError,
            )
        self.missing_exceptions = missing_exceptions
        self.check = check
        self.create = create
        if create:
            if not self.fs.exists(root):
                self.fs.mkdir(root)
        if check:
            if not self.fs.exists(root):
                raise ValueError(
                    f"Path {root} does not exist. Create "
                    f"with the ``create=True`` keyword"
                )
            self.fs.touch(root + "/a")
            self.fs.rm(root + "/a")

    @cached_property
    def dirfs(self):
        """dirfs instance that can be used with the same keys as the mapper"""
        from .implementations.dirfs import DirFileSystem

        return DirFileSystem(path=self._root_key_to_str, fs=self.fs)

    def clear(self):
        """Remove all keys below root - empties out mapping"""
        try:
            self.fs.rm(self.root, True)
            self.fs.mkdir(self.root)
        except:  # noqa: E722
            pass

    def getitems(self, keys, on_error="raise"):
        """Fetch multiple items from the store

        If the backend is async-able, this might proceed concurrently

        Parameters
        ----------
        keys: list(str)
            The keys to be fetched
        on_error : "raise", "omit", "return"
            If raise, an underlying exception will be raised (converted to KeyError
            if the type is in self.missing_exceptions); if omit, keys with exception
            will simply not be included in the output; if "return", all keys are
            included in the output, but the value will be bytes or an exception
            instance.

        Returns
        -------
        dict(key, bytes|exception)
        """
        keys2 = [self._key_to_str(k) for k in keys]
        oe = on_error if on_error == "raise" else "return"
        try:
            out = self.fs.cat(keys2, on_error=oe)
            if isinstance(out, bytes):
                out = {keys2[0]: out}
        except self.missing_exceptions as e:
            raise KeyError from e
        out = {
            k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
            for k, v in out.items()
        }
        return {
            key: out[k2]
            for key, k2 in zip(keys, keys2)
            if on_error == "return" or not isinstance(out[k2], BaseException)
        }

    def setitems(self, values_dict):
        """Set the values of multiple items in the store

        Parameters
        ----------
        values_dict: dict(str, bytes)
        """
        values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
        self.fs.pipe(values)

    def delitems(self, keys):
        """Remove multiple keys from the store"""
        self.fs.rm([self._key_to_str(k) for k in keys])

    def _key_to_str(self, key):
        """Generate full path for the key"""
        if not isinstance(key, str):
            # raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
            warnings.warn(
                "from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
                DeprecationWarning,
            )
            if isinstance(key, list):
                key = tuple(key)
            key = str(key)
        return f"{self._root_key_to_str}{key}"

    def _str_to_key(self, s):
        """Strip path of root prefix to leave key name"""
        return s[len(self.root) :].lstrip("/")

    def __getitem__(self, key, default=None):
        """Retrieve data"""
        k = self._key_to_str(key)
        try:
            result = self.fs.cat(k)
        except self.missing_exceptions:
            if default is not None:
                return default
            raise KeyError(key)
        return result

    def pop(self, key, default=None):
        """Pop data"""
        result = self.__getitem__(key, default)
        try:
            del self[key]
        except KeyError:
            pass
        return result

    def __setitem__(self, key, value):
        """Store value in key"""
        key = self._key_to_str(key)
        self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
        self.fs.pipe_file(key, maybe_convert(value))

    def __iter__(self):
        return (self._str_to_key(x) for x in self.fs.find(self.root))

    def __len__(self):
        return len(self.fs.find(self.root))

    def __delitem__(self, key):
        """Remove key"""
        try:
            self.fs.rm(self._key_to_str(key))
        except:  # noqa: E722
            raise KeyError

    def __contains__(self, key):
        """Does key exist in mapping?"""
        path = self._key_to_str(key)
        return self.fs.exists(path) and self.fs.isfile(path)

    def __reduce__(self):
        return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)


def maybe_convert(value):
    if isinstance(value, array.array) or hasattr(value, "__array__"):
        # bytes-like things
        if hasattr(value, "dtype") and value.dtype.kind in "Mm":
            # The buffer interface doesn't support datetime64/timedelta64 numpy
            # arrays
            value = value.view("int64")
        value = bytes(memoryview(value))
    return value


def get_mapper(
    url="",
    check=False,
    create=False,
    missing_exceptions=None,
    alternate_root=None,
    **kwargs,
):
    """Create key-value interface for given URL and options

    The URL will be of the form "protocol://location" and point to the root
    of the mapper required. All keys will be file-names below this location,
    and their values the contents of each key.

    Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.

    Parameters
    ----------
    url: str
        Root URL of mapping
    check: bool
        Whether to attempt to read from the location before instantiation, to
        check that the mapping does exist
    create: bool
        Whether to make the directory corresponding to the root before
        instantiating
    missing_exceptions: None or tuple
        If given, these exception types will be regarded as missing keys and
        return KeyError when trying to read data. By default, you get
        (FileNotFoundError, IsADirectoryError, NotADirectoryError)
    alternate_root: None or str
        In cases of complex URLs, the parser may fail to pick the correct part
        for the mapper root, so this arg can override

    Returns
    -------
    ``FSMap`` instance, the dict-like key-value store.
    """
    # Removing protocol here - could defer to each open() on the backend
    fs, urlpath = url_to_fs(url, **kwargs)
    root = alternate_root if alternate_root is not None else urlpath
    return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
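A short sketch of how FSMap and get_mapper are typically used; the "memory://mapper-demo" root and the keys shown are illustrative assumptions, chosen so the example has no external side effects.

import fsspec

# get_mapper resolves the URL with url_to_fs and returns an FSMap rooted there.
m = fsspec.get_mapper("memory://mapper-demo", create=True)

m["a/b"] = b"payload"          # __setitem__ -> fs.pipe_file under the root
print(list(m))                 # __iter__ -> fs.find below the root
print(m["a/b"])                # __getitem__ -> fs.cat, KeyError if missing
print(m.getitems(["a/b"]))     # batch fetch, concurrent on async-able backends
del m["a/b"]                   # __delitem__ -> fs.rm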