KingNish commited on
Commit
a80ba5c
·
verified ·
1 Parent(s): fc21d85

Delete spaces

Browse files
spaces/__init__.py DELETED
@@ -1,30 +0,0 @@
1
- """
2
- """
3
-
4
- import sys
5
-
6
-
7
- if sys.version_info.minor < 8: # pragma: no cover
8
- raise RuntimeError("Importing PySpaces requires Python 3.8+")
9
-
10
-
11
- # Prevent gradio from importing spaces
12
- if (gr := sys.modules.get('gradio')) is not None: # pragma: no cover
13
- try:
14
- gr.Blocks
15
- except AttributeError:
16
- raise ImportError
17
-
18
-
19
- from .zero.decorator import GPU
20
- from .gradio import gradio_auto_wrap
21
- from .gradio import disable_gradio_auto_wrap
22
- from .gradio import enable_gradio_auto_wrap
23
-
24
-
25
- __all__ = [
26
- 'GPU',
27
- 'gradio_auto_wrap',
28
- 'disable_gradio_auto_wrap',
29
- 'enable_gradio_auto_wrap',
30
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/config.py DELETED
@@ -1,37 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- import os
6
- from pathlib import Path
7
-
8
- from .utils import boolean
9
-
10
-
11
- ZEROGPU_OFFLOAD_DIR_DEFAULT = str(Path.home() / '.zerogpu' / 'tensors')
12
-
13
-
14
- class Settings:
15
- def __init__(self):
16
- self.zero_gpu = boolean(
17
- os.getenv('SPACES_ZERO_GPU'))
18
- self.zero_device_api_url = (
19
- os.getenv('SPACES_ZERO_DEVICE_API_URL'))
20
- self.gradio_auto_wrap = boolean(
21
- os.getenv('SPACES_GRADIO_AUTO_WRAP'))
22
- self.zero_patch_torch_device = boolean(
23
- os.getenv('ZERO_GPU_PATCH_TORCH_DEVICE'))
24
- self.zero_gpu_v2 = boolean(
25
- os.getenv('ZEROGPU_V2'))
26
- self.zerogpu_offload_dir = (
27
- os.getenv('ZEROGPU_OFFLOAD_DIR', ZEROGPU_OFFLOAD_DIR_DEFAULT))
28
-
29
-
30
- Config = Settings()
31
-
32
-
33
- if Config.zero_gpu:
34
- assert Config.zero_device_api_url is not None, (
35
- 'SPACES_ZERO_DEVICE_API_URL env must be set '
36
- 'on ZeroGPU Spaces (identified by SPACES_ZERO_GPU=true)'
37
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/gradio.py DELETED
@@ -1,55 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- from typing import Callable
6
- from typing import Generator
7
- from typing import TypeVar
8
- from typing import overload
9
- from typing_extensions import ParamSpec
10
-
11
- from .config import Config
12
- from .zero.decorator import GPU
13
-
14
-
15
- Param = ParamSpec('Param')
16
- Res = TypeVar('Res')
17
-
18
-
19
- gradio_auto_wrap_enabled = Config.gradio_auto_wrap
20
-
21
-
22
- def disable_gradio_auto_wrap():
23
- global gradio_auto_wrap_enabled
24
- gradio_auto_wrap_enabled = False
25
-
26
- def enable_gradio_auto_wrap():
27
- global gradio_auto_wrap_enabled
28
- gradio_auto_wrap_enabled = True
29
-
30
-
31
- @overload
32
- def gradio_auto_wrap(
33
- task:
34
- Callable[Param, Res],
35
- ) -> Callable[Param, Res]:
36
- ...
37
- @overload
38
- def gradio_auto_wrap(
39
- task:
40
- None,
41
- ) -> None:
42
- ...
43
- def gradio_auto_wrap(
44
- task:
45
- Callable[Param, Res]
46
- | None,
47
- ) -> (Callable[Param, Res]
48
- | None):
49
- """
50
- """
51
- if not gradio_auto_wrap_enabled:
52
- return task
53
- if not callable(task):
54
- return task
55
- return GPU(task) # type: ignore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/utils.py DELETED
@@ -1,85 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- import ctypes
6
- import sys
7
- from functools import lru_cache as cache
8
- from functools import partial
9
-
10
- import multiprocessing
11
- from multiprocessing.queues import SimpleQueue as _SimpleQueue
12
- from pathlib import Path
13
- from pickle import PicklingError
14
- from typing import Callable
15
- from typing import TypeVar
16
-
17
-
18
- GRADIO_VERSION_ERROR_MESSAGE = "Make sure Gradio version is at least 3.46"
19
-
20
-
21
- T = TypeVar('T')
22
-
23
-
24
- @cache
25
- def self_cgroup_device_path() -> str:
26
- cgroup_content = Path('/proc/self/cgroup').read_text()
27
- for line in cgroup_content.strip().split('\n'):
28
- contents = line.split(':devices:')
29
- if len(contents) != 2:
30
- continue # pragma: no cover
31
- return contents[1]
32
- raise Exception # pragma: no cover
33
-
34
-
35
- if sys.version_info.minor < 9: # pragma: no cover
36
- _SimpleQueue.__class_getitem__ = classmethod(lambda cls, _: cls) # type: ignore
37
-
38
- class SimpleQueue(_SimpleQueue[T]):
39
- def __init__(self, *args):
40
- super().__init__(*args, ctx=multiprocessing.get_context('fork'))
41
- def put(self, obj: T):
42
- try:
43
- super().put(obj)
44
- except PicklingError:
45
- raise # pragma: no cover
46
- # https://bugs.python.org/issue29187
47
- except Exception as e:
48
- message = str(e)
49
- if not "pickle" in message:
50
- raise # pragma: no cover
51
- raise PicklingError(message)
52
- def close(self): # Python 3.8 static typing trick
53
- super().close() # type: ignore
54
- def wlock_release(self):
55
- if (lock := getattr(self, '_wlock', None)) is None:
56
- return # pragma: no cover
57
- try:
58
- lock.release()
59
- except ValueError:
60
- pass
61
-
62
-
63
- def drop_params(fn: Callable[[], T]) -> Callable[..., T]:
64
- def drop(*args):
65
- return fn()
66
- return drop
67
-
68
-
69
- def boolean(value: str | None) -> bool:
70
- return value is not None and value.lower() in ("1", "t", "true")
71
-
72
-
73
- def gradio_request_var():
74
- try:
75
- from gradio.context import LocalContext
76
- except ImportError: # pragma: no cover
77
- raise RuntimeError(GRADIO_VERSION_ERROR_MESSAGE)
78
- return LocalContext.request
79
-
80
-
81
- def malloc_trim():
82
- ctypes.CDLL("libc.so.6").malloc_trim(0)
83
-
84
-
85
- debug = partial(print, 'SPACES_ZERO_GPU_DEBUG')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/__init__.py DELETED
@@ -1,21 +0,0 @@
1
- """
2
- """
3
-
4
- from pathlib import Path
5
-
6
- from ..config import Config
7
-
8
-
9
- if Config.zero_gpu:
10
-
11
- from . import gradio
12
- from . import torch
13
-
14
- if torch.is_in_bad_fork():
15
- raise RuntimeError(
16
- "CUDA has been initialized before importing the `spaces` package"
17
- )
18
-
19
- torch.patch()
20
- gradio.one_launch(torch.pack)
21
- Path(Config.zerogpu_offload_dir).mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/api.py DELETED
@@ -1,156 +0,0 @@
1
- """
2
- Synced with huggingface/pyspaces:spaces/zero/api.py
3
- """
4
- from __future__ import annotations
5
-
6
- from datetime import timedelta
7
- from typing import Any
8
- from typing import Generator
9
- from typing import Literal
10
- from typing import NamedTuple
11
- from typing import Optional
12
- from typing import overload
13
-
14
- import httpx
15
- from pydantic import BaseModel
16
- from typing_extensions import assert_never
17
-
18
-
19
- AllowToken = str
20
- NvidiaIndex = int # TODO: Migrate to GpuIndex (less confusing for MIG)
21
- NvidiaUUID = str
22
- CGroupPath = str
23
- VisitorId = str
24
- Score = float
25
-
26
- AuthLevel = Literal['regular', 'pro']
27
-
28
-
29
- AUTHENTICATED_HEADER = 'X-Authenticated'
30
-
31
-
32
- class ScheduleResponse(BaseModel):
33
- idle: bool
34
- nvidiaIndex: int
35
- nvidiaUUID: str
36
- allowToken: str
37
-
38
-
39
- class QuotaInfos(BaseModel):
40
- left: int
41
- wait: timedelta
42
-
43
-
44
- class ReportUsageMonitoringParams(NamedTuple):
45
- nvidia_index: int
46
- visitor_id: str
47
- duration: timedelta
48
-
49
-
50
- class QueueEvent(BaseModel):
51
- event: Literal['ping', 'failed', 'succeeded']
52
- data: Optional[ScheduleResponse] = None
53
-
54
-
55
- def sse_parse(text: str):
56
- event, *data = text.strip().splitlines()
57
- assert event.startswith('event:')
58
- event = event[6:].strip()
59
- if event in ('ping', 'failed'):
60
- return QueueEvent(event=event)
61
- assert event == 'succeeded'
62
- (data,) = data
63
- assert data.startswith('data:')
64
- data = data[5:].strip()
65
- return QueueEvent(event=event, data=ScheduleResponse.parse_raw(data))
66
-
67
-
68
- def sse_stream(res: httpx.Response) -> Generator[QueueEvent, Any, None]:
69
- for text in res.iter_text():
70
- if len(text) == 0:
71
- break # pragma: no cover
72
- try:
73
- yield sse_parse(text)
74
- except GeneratorExit:
75
- res.close()
76
- break
77
-
78
-
79
- class APIClient:
80
-
81
- def __init__(self, client: httpx.Client):
82
- self.client = client
83
-
84
- def startup_report(self) -> httpx.codes:
85
- res = self.client.post('/startup-report')
86
- return httpx.codes(res.status_code)
87
-
88
- def schedule(
89
- self,
90
- cgroup_path: str,
91
- task_id: int = 0,
92
- token: str | None = None,
93
- duration_seconds: int | None = None,
94
- enable_queue: bool = True,
95
- ):
96
- params: dict[str, str | int | bool] = {
97
- 'cgroupPath': cgroup_path,
98
- 'taskId': task_id,
99
- 'enableQueue': enable_queue,
100
- }
101
- if duration_seconds is not None:
102
- params['durationSeconds'] = duration_seconds
103
- if token is not None:
104
- params['token'] = token
105
- res = self.client.send(
106
- request=self.client.build_request(
107
- method='POST',
108
- url='/schedule',
109
- params=params,
110
- ),
111
- stream=True,
112
- )
113
- status = httpx.codes(res.status_code)
114
- auth: AuthLevel | None = res.headers.get(AUTHENTICATED_HEADER)
115
- if (status is not httpx.codes.OK and
116
- status is not httpx.codes.TOO_MANY_REQUESTS
117
- ):
118
- res.close()
119
- return status, auth
120
- if "text/event-stream" in res.headers['content-type']:
121
- return sse_stream(res), auth
122
- res.read()
123
- if status is httpx.codes.TOO_MANY_REQUESTS:
124
- return QuotaInfos(**res.json()), auth # pragma: no cover
125
- if status is httpx.codes.OK:
126
- return ScheduleResponse(**res.json()), auth
127
- assert_never(status)
128
-
129
- def allow(
130
- self,
131
- allow_token: str,
132
- pid: int,
133
- ):
134
- res = self.client.post('/allow', params={
135
- 'allowToken': allow_token,
136
- 'pid': pid,
137
- })
138
- return httpx.codes(res.status_code)
139
-
140
- def release(
141
- self,
142
- allow_token: str,
143
- fail: bool = False,
144
- ) -> httpx.codes:
145
- res = self.client.post('/release', params={
146
- 'allowToken': allow_token,
147
- 'fail': fail,
148
- })
149
- return httpx.codes(res.status_code)
150
-
151
- def get_queue_size(self) -> int:
152
- res = self.client.get('/queue-size')
153
- assert res.status_code == 200, res.status_code
154
- size = res.json()
155
- assert isinstance(size, int)
156
- return size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/client.py DELETED
@@ -1,239 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- import os
6
- import time
7
- import warnings
8
- from datetime import timedelta
9
-
10
- import gradio as gr
11
- import httpx
12
- from packaging import version
13
- from typing_extensions import assert_never
14
-
15
- from .. import utils
16
- from ..config import Config
17
- from .api import APIClient
18
- from .api import AuthLevel
19
- from .api import QuotaInfos
20
- from .api import ScheduleResponse
21
- from .gradio import HTMLError
22
- from .gradio import get_event
23
- from .gradio import supports_auth
24
-
25
-
26
- TOKEN_HEADER = 'X-IP-Token'
27
- DEFAULT_SCHEDULE_DURATION = 60
28
-
29
- QUOTA_MESSAGE = "You have exceeded your GPU quota"
30
- UNUSED_MESSAGE = "GPU device not used"
31
- NO_GPU_MESSAGE_REGULAR = "No GPU was available"
32
- NO_GPU_MESSAGE_INQUEUE = "No GPU was available after 60s"
33
-
34
- SIGNUP_ON_HF_TXT = "Create a free account"
35
- SIGNUP_ON_HF_URL = "https://huggingface.co/join"
36
- SUBSCRIBE_TO_PRO_TXT = "Subscribe to Pro"
37
- SUBSCRIBE_TO_PRO_URL = "https://huggingface.co/settings/billing/subscription"
38
-
39
-
40
- def api_client():
41
- assert Config.zero_device_api_url is not None
42
- httpx_client = httpx.Client(base_url=Config.zero_device_api_url, timeout=60, verify=False)
43
- return APIClient(httpx_client)
44
-
45
-
46
- def startup_report():
47
- retries, max_retries = 0, 2
48
- client = api_client()
49
- while (status := client.startup_report()) is httpx.codes.NOT_FOUND: # pragma: no cover
50
- time.sleep(1)
51
- if (retries := retries + 1) > max_retries:
52
- raise RuntimeError("Error while initializing ZeroGPU: NotFound")
53
- if status is not httpx.codes.OK: # pragma: no cover
54
- raise RuntimeError("Error while initializing ZeroGPU: Unknown")
55
-
56
-
57
- def html_string(html_contents: str, text_contents: str): # pragma: no cover
58
- class HTMLString(str):
59
- def __str__(self):
60
- return text_contents
61
- return HTMLString(html_contents)
62
-
63
-
64
- def _toast_action(
65
- auth: AuthLevel | None,
66
- supports_html: bool,
67
- pro_message: str,
68
- unlogged_desc: str,
69
- logged_desc: str,
70
- ending: str,
71
- ) -> tuple[str, str]: # pragma: no cover
72
- if not supports_auth() or auth == 'pro':
73
- return pro_message, pro_message
74
- html = ""
75
- link = SIGNUP_ON_HF_URL if auth is None else SUBSCRIBE_TO_PRO_URL
76
- text = SIGNUP_ON_HF_TXT if auth is None else SUBSCRIBE_TO_PRO_TXT
77
- desc = unlogged_desc if auth is None else logged_desc
78
- desc += f" {ending}."
79
- style = ";".join([
80
- "white-space: nowrap",
81
- "text-underline-offset: 2px",
82
- "color: var(--body-text-color)",
83
- ])
84
- if supports_html:
85
- html += f'<a style="{style}" href="{link}">'
86
- html += text
87
- if supports_html:
88
- html += '</a> '
89
- html += desc
90
- markdown = f'[{text}]({link}) {desc}'
91
- return html, markdown
92
-
93
-
94
- def schedule(
95
- task_id: int,
96
- request: gr.Request | None = None,
97
- duration: timedelta | None = None,
98
- _first_attempt: bool = True,
99
- ) -> ScheduleResponse:
100
-
101
- if not (gradio_version := version.parse(gr.__version__)).major >= 4: # pragma: no cover
102
- raise RuntimeError("ZeroGPU is only compatible with Gradio 4+")
103
-
104
- GRADIO_HTML_TOASTS = gradio_version.minor >= 39
105
-
106
- res, auth = api_client().schedule(
107
- cgroup_path=utils.self_cgroup_device_path(),
108
- task_id=task_id,
109
- token=_get_token(request),
110
- duration_seconds=duration.seconds if duration is not None else None,
111
- )
112
-
113
- if isinstance(res, ScheduleResponse):
114
- return res
115
-
116
- if isinstance(res, QuotaInfos): # pragma: no cover
117
- requested = duration.seconds if duration is not None else DEFAULT_SCHEDULE_DURATION
118
- if res.wait < timedelta(0):
119
- raise gr.Error(
120
- f"The requested GPU duration ({requested}s) "
121
- f"is larger than the maximum allowed"
122
- )
123
- else:
124
- gpu = "Pro GPU" if auth == 'pro' else ("free GPU" if auth == 'regular' else "GPU")
125
- message = (
126
- f"You have exceeded your {gpu} quota "
127
- f"({requested}s requested vs. {res.left}s left)."
128
- )
129
- details_html, details_markdown = _toast_action(
130
- auth=auth,
131
- supports_html=GRADIO_HTML_TOASTS,
132
- pro_message=f"Try again in {res.wait}",
133
- unlogged_desc="to get more",
134
- logged_desc="to get 5x more",
135
- ending="usage quota",
136
- )
137
- message_html = f"{message} {details_html}"
138
- message_text = f"{message} {details_markdown}"
139
- raise HTMLError(html_string(message_html, message_text))
140
-
141
- if not isinstance(res, httpx.codes): # pragma: no cover
142
- gr.Info("Waiting for a GPU to become available")
143
- # TODO: Sign-up message if not authenticated (after some time ?)
144
- connection_event = get_event()
145
- if connection_event is None and request is not None:
146
- warnings.warn("ZeroGPU: Cannot get Gradio app Queue instance")
147
- while True:
148
- try:
149
- event = next(res)
150
- except StopIteration:
151
- raise RuntimeError("Unexpected end of stream")
152
- except httpx.RemoteProtocolError:
153
- if not _first_attempt:
154
- raise RuntimeError("Error while re-trying after queue disconnect")
155
- return schedule(task_id, request, duration, _first_attempt=False)
156
- if event.event == 'ping':
157
- if connection_event is not None and not connection_event.alive:
158
- res.close()
159
- raise RuntimeError("Connection closed by visitor while queueing")
160
- continue
161
- if event.event == 'failed':
162
- details_html, details_markdown = _toast_action(
163
- auth=auth,
164
- supports_html=GRADIO_HTML_TOASTS,
165
- pro_message="Retry later",
166
- unlogged_desc="to get a higher",
167
- logged_desc="to get the highest",
168
- ending="priority in ZeroGPU queues",
169
- )
170
- message_html = f"{NO_GPU_MESSAGE_INQUEUE}. {details_html}"
171
- message_text = f"{NO_GPU_MESSAGE_INQUEUE} {details_markdown}"
172
- raise HTMLError(html_string(message_html, message_text))
173
- if event.event == 'succeeded':
174
- assert event.data is not None
175
- if connection_event is not None and not connection_event.alive:
176
- release(event.data.allowToken)
177
- raise RuntimeError("Connection closed by visitor on queue success")
178
- gr.Info("Successfully acquired a GPU")
179
- return event.data
180
-
181
- if res is httpx.codes.SERVICE_UNAVAILABLE:
182
- raise gr.Error(NO_GPU_MESSAGE_REGULAR)
183
-
184
- # TODO: Find a way to log 'detail' response field
185
- raise RuntimeError(f"ZeroGPU API /schedule error: {res} ({httpx.codes.get_reason_phrase(res)})") # pragma: no cover
186
-
187
-
188
- def allow(allow_token: str) -> None:
189
- pid = os.getpid()
190
- assert pid != 1, "Allowing PID 1 on ZeroGPU will end up killing your Space"
191
- assert api_client().allow(allow_token=allow_token, pid=pid) is httpx.codes.OK
192
-
193
-
194
- def release(
195
- allow_token: str, *,
196
- fail: bool = False,
197
- allow_404: bool = False,
198
- ) -> None:
199
-
200
- res = api_client().release(
201
- allow_token=allow_token,
202
- fail=fail,
203
- )
204
-
205
- if res is httpx.codes.NO_CONTENT: # pragma: no cover
206
- try:
207
- gr.Warning(UNUSED_MESSAGE)
208
- except AttributeError:
209
- pass
210
- warnings.warn(UNUSED_MESSAGE, RuntimeWarning)
211
- return None
212
-
213
- if res is httpx.codes.NOT_FOUND:
214
- if not allow_404:
215
- warnings.warn("ZeroGPU API /release warning: 404 Not Found")
216
- return None
217
-
218
- if httpx.codes.is_success(res):
219
- return None
220
-
221
- # TODO: Find a way to log 'detail' response field
222
- # TODO: Only raise in dev environment. Simply warn in production ?
223
- raise RuntimeError(f"ZeroGPU API /release error: {res} ({httpx.codes.get_reason_phrase(res)})") # pragma: no cover
224
-
225
-
226
- def _get_token(request: gr.Request | None) -> str | None:
227
-
228
- if request is None:
229
- return None
230
-
231
- headers = getattr(request, 'headers', None)
232
- if headers is None or not hasattr(headers, '__dict__'):
233
- raise gr.Error("Internal Gradio error")
234
-
235
- # Compatibility trick
236
- if not hasattr(headers, 'get'):
237
- headers = headers.__dict__ # pragma: no cover
238
-
239
- return headers.get(TOKEN_HEADER.lower())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/decorator.py DELETED
@@ -1,113 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- import inspect
6
- import sys
7
- import warnings
8
- from datetime import timedelta
9
- from functools import partial
10
- from typing import Callable
11
- from typing import TypeVar
12
- from typing import overload
13
- from typing_extensions import ParamSpec
14
- from typing_extensions import Unpack
15
-
16
- from ..config import Config
17
- from .types import DynamicDuration
18
- from .types import EmptyKwargs
19
-
20
-
21
- P = ParamSpec('P')
22
- R = TypeVar('R')
23
-
24
-
25
- decorated_cache: dict[Callable, Callable] = {}
26
-
27
-
28
- @overload
29
- def GPU(
30
- task: None = None, *,
31
- duration: DynamicDuration[P] = None,
32
- ) -> Callable[[Callable[P, R]], Callable[P, R]]:
33
- ...
34
- @overload
35
- def GPU(
36
- task: Callable[P, R], *,
37
- duration: DynamicDuration[P] = None,
38
- ) -> Callable[P, R]:
39
- ...
40
- def GPU(
41
- task: Callable[P, R] | None = None, *,
42
- duration: DynamicDuration[P] = None,
43
- **kwargs: Unpack[EmptyKwargs],
44
- ) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]:
45
- """
46
- ZeroGPU decorator
47
-
48
- Basic usage:
49
- ```
50
- @spaces.GPU
51
- def fn(...):
52
- # CUDA is available here
53
- pass
54
- ```
55
-
56
- With custom duration:
57
- ```
58
- @spaces.GPU(duration=45) # Expressed in seconds
59
- def fn(...):
60
- # CUDA is available here
61
- pass
62
- ```
63
-
64
- Args:
65
- task (`Callable | None`): Python function that requires CUDA
66
- duration (`int | datetime.timedelta`): Estimated duration in seconds or `datetime.timedelta`
67
-
68
- Returns:
69
- `Callable`: GPU-ready function
70
- """
71
- if "enable_queue" in kwargs:
72
- warnings.warn("`enable_queue` parameter is now ignored and always set to `True`")
73
- if task is None:
74
- return partial(_GPU, duration=duration)
75
- return _GPU(task, duration)
76
-
77
-
78
- def _GPU(
79
- task: Callable[P, R],
80
- duration: DynamicDuration[P],
81
- ) -> Callable[P, R]:
82
-
83
- if not Config.zero_gpu:
84
- return task
85
-
86
- from . import client
87
- from .wrappers import regular_function_wrapper
88
- from .wrappers import generator_function_wrapper
89
-
90
- if sys.version_info.minor < 9: # pragma: no cover
91
- raise RuntimeError("Actually using @spaces.GPU on a ZeroGPU Space requires Python 3.9+")
92
-
93
- if task in decorated_cache:
94
- # TODO: Assert same duration ?
95
- return decorated_cache[task] # type: ignore
96
-
97
- if inspect.iscoroutinefunction(task):
98
- raise NotImplementedError
99
-
100
- if inspect.isgeneratorfunction(task):
101
- decorated = generator_function_wrapper(task, duration)
102
- else:
103
- decorated = regular_function_wrapper(task, duration)
104
-
105
- setattr(decorated, 'zerogpu', None)
106
-
107
- client.startup_report()
108
- decorated_cache.update({
109
- task: decorated,
110
- decorated: decorated,
111
- })
112
-
113
- return decorated # type: ignore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/gradio.py DELETED
@@ -1,150 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- from functools import wraps
6
- from packaging import version
7
- from typing import Callable
8
- from typing import NamedTuple
9
- from typing import TYPE_CHECKING
10
- import warnings
11
-
12
- import gradio as gr
13
- from gradio.context import Context
14
- from gradio.context import LocalContext
15
- from gradio.helpers import Progress
16
- from gradio.helpers import TrackedIterable
17
- from gradio.queueing import Queue
18
- from typing_extensions import ParamSpec
19
-
20
- from ..utils import SimpleQueue
21
- from .types import GeneratorResQueueResult
22
- from .types import GradioQueueEvent
23
- from .types import RegularResQueueResult
24
-
25
-
26
- QUEUE_RPC_METHODS = [
27
- "set_progress",
28
- "log_message",
29
- ]
30
-
31
-
32
- class GradioPartialContext(NamedTuple):
33
- event_id: str | None
34
- in_event_listener: bool
35
- progress: Progress | None
36
-
37
- @staticmethod
38
- def get():
39
- TrackedIterable.__reduce__ = tracked_iterable__reduce__
40
- return GradioPartialContext(
41
- event_id=LocalContext.event_id.get(),
42
- in_event_listener=LocalContext.in_event_listener.get(),
43
- progress=LocalContext.progress.get(),
44
- )
45
-
46
- @staticmethod
47
- def apply(context: 'GradioPartialContext'):
48
- LocalContext.event_id.set(context.event_id)
49
- LocalContext.in_event_listener.set(context.in_event_listener)
50
- LocalContext.progress.set(context.progress)
51
-
52
-
53
- def get_queue_instance():
54
- blocks = LocalContext.blocks.get()
55
- if blocks is None: # pragma: no cover
56
- return None
57
- return blocks._queue
58
-
59
-
60
- def get_event():
61
- queue = get_queue_instance()
62
- event_id = LocalContext.event_id.get()
63
- if queue is None:
64
- return None
65
- if event_id is None: # pragma: no cover
66
- return None
67
- for job in queue.active_jobs:
68
- if job is None: # pragma: no cover
69
- continue
70
- for event in job:
71
- if event._id == event_id:
72
- return event
73
-
74
-
75
- def get_server_port() -> int | None:
76
- from_request_context = True
77
- if (blocks := LocalContext.blocks.get()) is None: # Request
78
- from_request_context = False
79
- if (blocks := Context.root_block) is None: # Caching
80
- return None
81
- if (server := getattr(blocks, 'server', None)) is None:
82
- if from_request_context:
83
- warnings.warn("Gradio: No blocks.server inside a request") # pragma: no cover
84
- return -1
85
- if TYPE_CHECKING:
86
- assert (server := blocks.server)
87
- return server.config.port
88
-
89
-
90
- def try_process_queue_event(method_name: str, *args, **kwargs):
91
- queue = get_queue_instance()
92
- if queue is None: # pragma: no cover
93
- warnings.warn("ZeroGPU: Cannot get Gradio app Queue instance")
94
- return
95
- method = getattr(queue, method_name, None)
96
- assert callable(method)
97
- method(*args, **kwargs)
98
-
99
-
100
- def patch_gradio_queue(
101
- res_queue: SimpleQueue[RegularResQueueResult | None] | SimpleQueue[GeneratorResQueueResult | None],
102
- ):
103
-
104
- def rpc_method(method_name: str):
105
- def method(*args, **kwargs):
106
- if args and isinstance(args[0], Queue):
107
- args = args[1:] # drop `self`
108
- res_queue.put(GradioQueueEvent(method_name, args, kwargs))
109
- return method
110
-
111
- for method_name in QUEUE_RPC_METHODS:
112
- if (method := getattr(Queue, method_name, None)) is None: # pragma: no cover
113
- warnings.warn(f"ZeroGPU: Gradio Queue has no {method_name} attribute")
114
- continue
115
- if not callable(method): # pragma: no cover
116
- warnings.warn(f"ZeroGPU: Gradio Queue {method_name} is not callable")
117
- continue
118
- setattr(Queue, method_name, rpc_method(method_name))
119
-
120
- TrackedIterable.__reduce__ = tracked_iterable__reduce__
121
-
122
-
123
- def tracked_iterable__reduce__(self):
124
- res: tuple = super(TrackedIterable, self).__reduce__() # type: ignore
125
- cls, base, state, *_ = res
126
- return cls, base,{**state, **{
127
- 'iterable': None,
128
- '_tqdm': None,
129
- }}
130
-
131
-
132
- def supports_auth():
133
- return version.parse(gr.__version__) >= version.Version('4.27.0')
134
-
135
-
136
- Param = ParamSpec('Param')
137
-
138
- def one_launch(task: Callable[Param, None], *task_args: Param.args, **task_kwargs: Param.kwargs):
139
- _launch = gr.Blocks.launch
140
- @wraps(gr.Blocks.launch)
141
- def launch(*args, **kwargs):
142
- task(*task_args, **task_kwargs)
143
- gr.Blocks.launch = _launch
144
- return gr.Blocks.launch(*args, **kwargs)
145
- gr.Blocks.launch = launch
146
-
147
-
148
- class HTMLError(gr.Error):
149
- def __str__(self): # pragma: no cover
150
- return self.message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/torch/__init__.py DELETED
@@ -1,42 +0,0 @@
1
- """
2
- """
3
-
4
- from ...config import Config
5
-
6
-
7
- try:
8
-
9
- import torch
10
-
11
- except ImportError:
12
-
13
- _patch = lambda *args, **kwargs: None
14
- _unpatch = lambda *args, **kwargs: None
15
- _pack = lambda *args, **kwargs: None
16
- _init = lambda *args, **kwargs: None
17
- _size = lambda *args, **kwargs: 0
18
- _move = lambda *args, **kwargs: None
19
- _is_in_bad_fork = lambda *args, **kwargs: False
20
-
21
- else:
22
-
23
- if Config.zero_gpu_v2:
24
- from . import patching as _patching
25
- else: # pragma: no cover
26
- from . import patching_legacy as _patching
27
-
28
- _patch = _patching.patch
29
- _unpatch = _patching.unpatch
30
- _pack = _patching.pack
31
- _init = _patching.init
32
- _size = _patching.size
33
- _move = _patching.move
34
- _is_in_bad_fork = _patching.is_in_bad_fork
35
-
36
- patch = _patch
37
- unpatch = _unpatch
38
- pack = _pack
39
- init = _init
40
- size = _size
41
- move = _move
42
- is_in_bad_fork = _is_in_bad_fork
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/torch/bitsandbytes.py DELETED
@@ -1,162 +0,0 @@
1
- """
2
- """
3
- # pyright: reportPrivateImportUsage=false
4
-
5
- from __future__ import annotations
6
-
7
- import importlib
8
- from contextlib import contextmanager
9
- from importlib import metadata
10
- from types import ModuleType
11
- from typing import TYPE_CHECKING
12
- from typing import Tuple
13
-
14
- import torch
15
- from packaging import version
16
-
17
- if TYPE_CHECKING:
18
- import torch as Torch
19
-
20
-
21
- @contextmanager
22
- def cuda_unavailable(torch: ModuleType):
23
- _is_available = torch.cuda.is_available
24
- torch.cuda.is_available = lambda: False
25
- yield
26
- torch.cuda.is_available = _is_available
27
-
28
-
29
- def maybe_import_bitsandbytes():
30
- try:
31
- import torch
32
- except ImportError: # pragma: no cover
33
- return None
34
- with cuda_unavailable(torch):
35
- try:
36
- import bitsandbytes
37
- except ImportError:
38
- bitsandbytes = None
39
- else:
40
- if (bnb_version := version.parse(metadata.version('bitsandbytes'))) < version.parse('0.40.0'):
41
- raise RuntimeError(f"ZeroGPU requires bitsandbytes >= 0.40.0 (installed: {bnb_version})") # pragma: no cover
42
- print("↑ Those bitsandbytes warnings are expected on ZeroGPU ↑")
43
- return bitsandbytes
44
-
45
-
46
- if (bnb := maybe_import_bitsandbytes()):
47
-
48
- from torch.utils.weak import WeakTensorKeyDictionary
49
-
50
- with cuda_unavailable(torch):
51
- from bitsandbytes import cextension
52
- from bitsandbytes import functional
53
- try: # bitsandbytes < 0.44
54
- from bitsandbytes.cuda_setup.main import CUDASetup
55
- except ModuleNotFoundError: # pragma: no cover
56
- CUDASetup = None
57
- from bitsandbytes.nn import Int8Params
58
- from bitsandbytes.nn import Params4bit
59
-
60
- _param_to_8bit = Int8Params.to # type: ignore
61
- _param_cuda_8bit = Int8Params.cuda
62
- _param_to_4bit = Params4bit.to # type: ignore
63
- _param_cuda_4bit = Params4bit.cuda
64
-
65
- TensorToArgs = Tuple[torch.device, torch.dtype, bool, torch.memory_format]
66
-
67
- to_ops_8bit: dict[Int8Params, TensorToArgs | None] = WeakTensorKeyDictionary() # type: ignore
68
- to_ops_4bit: dict[Params4bit, TensorToArgs | None] = WeakTensorKeyDictionary() # type: ignore
69
-
70
- def _to_op_register_8bit(self: Int8Params, *args, **kwargs):
71
- parsed = torch._C._nn._parse_to(*args, **kwargs)
72
- device, *_ = parsed
73
- if not isinstance(device, torch.device): # pragma: no cover
74
- return _param_to_8bit(self, *args, **kwargs)
75
- if device.type != 'cuda':
76
- return _param_to_8bit(self, *args, **kwargs)
77
- to_ops_8bit[self] = parsed
78
- return self
79
-
80
- def _to_op_register_4bit(self: Params4bit, *args, **kwargs):
81
- parsed = torch._C._nn._parse_to(*args, **kwargs)
82
- device, *_ = parsed
83
- if not isinstance(device, torch.device): # pragma: no cover
84
- return _param_to_4bit(self, *args, **kwargs)
85
- if device.type != 'cuda':
86
- return _param_to_4bit(self, *args, **kwargs)
87
- to_ops_4bit[self] = parsed
88
- return self
89
-
90
- def _cuda_op_arg_check(device: Torch.device | int | str | None) -> bool:
91
- if device is None: # pragma: no cover
92
- return True
93
- if isinstance(device, int):
94
- return True
95
- if isinstance(device, str): # pragma: no cover
96
- device = torch.device(device)
97
- return device.type == 'cuda' # pragma: no cover
98
-
99
- def _cuda_op_register_8bit(self: Int8Params, device: Torch.device | int | str | None = None, **kwargs):
100
- if not _cuda_op_arg_check(device): # pragma: no cover
101
- # Let PyTorch handle the fail
102
- return _param_cuda_8bit(self, device, **kwargs)
103
- to_ops_8bit[self] = None
104
- return self
105
-
106
- def _cuda_op_register_4bit(self: Params4bit, device: Torch.device | int | str | None = None, **kwargs):
107
- if not _cuda_op_arg_check(device): # pragma: no cover
108
- # Let PyTorch handle the fail
109
- return _param_cuda_4bit(self, device, **kwargs)
110
- to_ops_4bit[self] = None
111
- return self
112
-
113
- def _patch():
114
- Int8Params.to = _to_op_register_8bit # type: ignore
115
- Int8Params.cuda = _cuda_op_register_8bit # type: ignore
116
- Params4bit.to = _to_op_register_4bit # type: ignore
117
- Params4bit.cuda = _cuda_op_register_4bit # type: ignore
118
-
119
- def _unpatch():
120
- Int8Params.to = _param_to_8bit # type: ignore
121
- Int8Params.cuda = _param_cuda_8bit
122
- Params4bit.to = _param_to_4bit # type: ignore
123
- Params4bit.cuda = _param_cuda_4bit
124
-
125
- def _move():
126
- if CUDASetup is not None:
127
- CUDASetup._instance = None
128
- importlib.reload(cextension)
129
- functional.lib = cextension.lib
130
- for op in to_ops_8bit.items():
131
- tensor, parsed_args = op
132
- if parsed_args:
133
- _, dtype, _, memory_format = parsed_args
134
- else:
135
- dtype, memory_format = None, None
136
- tensor.data = _param_to_8bit(tensor,
137
- device='cuda',
138
- dtype=dtype,
139
- memory_format=memory_format,
140
- ) # type: ignore
141
- for op in to_ops_4bit.items():
142
- tensor, parsed_args = op
143
- if parsed_args:
144
- _, dtype, _, memory_format = parsed_args
145
- else:
146
- dtype, memory_format = None, None
147
- tensor.data = _param_to_4bit(tensor,
148
- device='cuda',
149
- dtype=dtype,
150
- memory_format=memory_format,
151
- ) # type: ignore
152
-
153
- else:
154
-
155
- _patch = lambda: None
156
- _unpatch = lambda: None
157
- _move = lambda: None
158
-
159
-
160
- patch = _patch
161
- unpatch = _unpatch
162
- move = _move
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/torch/packing.py DELETED
@@ -1,209 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- import time
6
-
7
- import ctypes
8
- import os
9
- from concurrent.futures import as_completed
10
- from concurrent.futures import ThreadPoolExecutor
11
- from contextvars import copy_context
12
- from dataclasses import dataclass
13
- from queue import Queue
14
- from typing import Callable
15
-
16
- from ...utils import debug
17
-
18
- import torch
19
- from typing_extensions import TypeAlias
20
-
21
-
22
- PAGE_SIZE = 4096
23
- TOTAL_MEMORY = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
24
- VM_MAX_SIZE = min(2**38, TOTAL_MEMORY // 2)
25
-
26
- BUFFER_SIZE = 64 * 2**20
27
- BUFFER_COUNT = 2
28
-
29
-
30
- TensorWithSizes: TypeAlias = 'tuple[torch.Tensor, int, int]'
31
-
32
- @dataclass
33
- class ZeroGPUTensorPack:
34
- base_dir: str
35
- batches: list[list[TensorWithSizes]]
36
- big_tensors: list[TensorWithSizes]
37
- fakes: dict[torch.Tensor, list[torch.Tensor]]
38
- total_size: int
39
- def path(self):
40
- return f'{self.base_dir}/{id(self)}'
41
- def __del__(self):
42
- try:
43
- os.remove(self.path())
44
- except FileNotFoundError: # pragma: no cover
45
- pass
46
-
47
-
48
- def write(fd: int, tensor: torch.Tensor):
49
- clone = torch.empty_like(tensor)
50
- size = clone.untyped_storage().size() # pyright: ignore [reportAttributeAccessIssue]
51
- buffer = torch.UntypedStorage(VM_MAX_SIZE)
52
- buffer_ptr = buffer.data_ptr()
53
- offset = -buffer_ptr % PAGE_SIZE
54
- padding = -size % PAGE_SIZE
55
- clone.set_(buffer[offset:offset+size], 0, clone.shape, clone.stride()) # pyright: ignore [reportArgumentType]
56
- clone.copy_(tensor)
57
- mv = memoryview((ctypes.c_char * (size+padding)).from_address(buffer_ptr+offset))
58
- written_bytes = 0
59
- while written_bytes < size:
60
- written_bytes += os.write(fd, mv[written_bytes:])
61
-
62
-
63
- def pack_tensors(
64
- tensors: set[torch.Tensor],
65
- fakes: dict[torch.Tensor, list[torch.Tensor]],
66
- offload_dir: str,
67
- callback: Callable[[int]] | None = None,
68
- ):
69
-
70
- callback = (lambda bytes: None) if callback is None else callback
71
-
72
- batches: list[list[TensorWithSizes]] = []
73
- big_tensors: list[TensorWithSizes] = []
74
-
75
- tensors_with_sizes: list[tuple[torch.Tensor, int, int]] = []
76
- for tensor in tensors:
77
- size = tensor.numel() * tensor.element_size()
78
- aligned_size = size + (-size % PAGE_SIZE)
79
- tensors_with_sizes += [(tensor, size, aligned_size)]
80
-
81
- current_batch, current_size = [], 0
82
- for (tensor, size, aligned_size) in sorted(tensors_with_sizes, key=lambda item: item[2]):
83
- if aligned_size > BUFFER_SIZE:
84
- big_tensors += [(tensor, size, aligned_size)]
85
- continue
86
- current_size += aligned_size
87
- if current_size > BUFFER_SIZE:
88
- batches += [current_batch]
89
- current_batch, current_size = [(tensor, size, aligned_size)], aligned_size
90
- else:
91
- current_batch += [(tensor, size, aligned_size)]
92
-
93
- if current_batch:
94
- batches += [current_batch]
95
-
96
- get_meta = {tensor: torch.empty_like(tensor) for tensor in tensors}
97
- batches_meta = [[(get_meta[tensor], size, asize) for tensor, size, asize in batch] for batch in batches]
98
- big_tensors_meta = [(get_meta[tensor], size, asize) for tensor, size, asize in big_tensors]
99
- fakes_meta = {get_meta[tensor]: fake_list for tensor, fake_list in fakes.items()}
100
-
101
- pack = ZeroGPUTensorPack(
102
- base_dir=offload_dir,
103
- batches=batches_meta,
104
- big_tensors=big_tensors_meta,
105
- fakes=fakes_meta,
106
- total_size=sum([size for _, size, _ in tensors_with_sizes]),
107
- )
108
-
109
- fd = os.open(pack.path(), os.O_CREAT | os.O_WRONLY | os.O_DIRECT)
110
- try:
111
- total_asize = sum([aligned_size for batch in batches for *_, aligned_size in batch])
112
- total_asize += sum([aligned_size for *_, aligned_size in big_tensors])
113
- if total_asize > 0:
114
- os.posix_fallocate(fd, 0, total_asize)
115
- for batch in batches:
116
- for tensor, size, _ in batch:
117
- write(fd, tensor)
118
- callback(size)
119
- for tensor, size, _ in big_tensors:
120
- write(fd, tensor)
121
- callback(size)
122
- return pack
123
- finally:
124
- os.close(fd)
125
-
126
-
127
- def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = None):
128
-
129
- callback = (lambda bytes: None) if callback is None else callback
130
-
131
- free_buffers: Queue[torch.Tensor] = Queue()
132
- read_buffers: Queue[torch.Tensor] = Queue()
133
-
134
- for _ in range(BUFFER_COUNT):
135
- free_buffers.put(torch.ByteTensor(BUFFER_SIZE).pin_memory())
136
-
137
- def read(fd: int, buffer: torch.Tensor, size: int):
138
- mv = memoryview((ctypes.c_char * size).from_address(buffer.data_ptr()))
139
- read_bytes = 0
140
- while read_bytes < size:
141
- read_bytes += os.readv(fd, [mv[read_bytes:]])
142
-
143
- def disk_to_pin(fd: int):
144
- for batch in pack.batches:
145
- buffer = free_buffers.get()
146
- batch_size = sum([aligned_size for *_, aligned_size in batch])
147
- read(fd, buffer, batch_size)
148
- read_buffers.put(buffer)
149
- for *_, aligned_size in pack.big_tensors:
150
- read_bytes = 0
151
- while read_bytes < aligned_size:
152
- buffer = free_buffers.get()
153
- read_size = min(BUFFER_SIZE, aligned_size - read_bytes)
154
- read(fd, buffer, read_size)
155
- read_buffers.put(buffer)
156
- read_bytes += read_size
157
-
158
- def pin_to_cuda():
159
- total_duration_in_callback = 0
160
- for batch in pack.batches:
161
- buffer = read_buffers.get()
162
- offset = 0
163
- cuda_storages = []
164
- for tensor, size, aligned_size in batch:
165
- cuda_storages += [buffer[offset:offset+size].cuda(non_blocking=True)]
166
- offset += aligned_size
167
- torch.cuda.synchronize()
168
- free_buffers.put(buffer)
169
- batch_total_size = 0
170
- for (tensor, size, _), cuda_storage in zip(batch, cuda_storages):
171
- cuda_tensor = torch.tensor([], dtype=tensor.dtype, device='cuda')
172
- cuda_tensor = cuda_tensor.set_(cuda_storage.untyped_storage(), 0, tensor.shape, tensor.stride())
173
- for fake in pack.fakes[tensor]:
174
- fake.data = cuda_tensor
175
- batch_total_size += size
176
- t0 = time.perf_counter()
177
- callback(batch_total_size)
178
- total_duration_in_callback += time.perf_counter() - t0
179
- for tensor, size, _ in pack.big_tensors:
180
- cuda_storage = torch.empty(size, dtype=torch.uint8, device='cuda')
181
- offset = 0
182
- while offset < size:
183
- buffer = read_buffers.get()
184
- read_size = min(BUFFER_SIZE, size - offset)
185
- cuda_storage[offset:offset+read_size] = buffer[:read_size]
186
- offset += read_size
187
- torch.cuda.synchronize() # Probably not needed
188
- free_buffers.put(buffer)
189
- t0 = time.perf_counter()
190
- callback(read_size)
191
- total_duration_in_callback += time.perf_counter() - t0
192
- cuda_tensor = torch.tensor([], dtype=tensor.dtype, device='cuda')
193
- cuda_tensor = cuda_tensor.set_(cuda_storage.untyped_storage(), 0, tensor.shape, tensor.stride())
194
- for fake in pack.fakes[tensor]:
195
- fake.data = cuda_tensor
196
-
197
- debug(f"{total_duration_in_callback=}")
198
-
199
- with ThreadPoolExecutor(2) as e:
200
- fd = os.open(pack.path(), os.O_RDONLY | os.O_DIRECT)
201
- try:
202
- futures = [
203
- e.submit(copy_context().run, disk_to_pin, fd),
204
- e.submit(copy_context().run, pin_to_cuda),
205
- ]
206
- for future in as_completed(futures):
207
- future.result()
208
- finally:
209
- os.close(fd)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
spaces/zero/torch/patching.py DELETED
@@ -1,386 +0,0 @@
1
- """
2
- """
3
- # pyright: reportPrivateImportUsage=false
4
-
5
- from __future__ import annotations
6
-
7
- import gc
8
- import multiprocessing
9
- import os
10
- from collections import defaultdict
11
- from concurrent.futures import ProcessPoolExecutor
12
- from concurrent.futures import ThreadPoolExecutor
13
- from contextlib import nullcontext
14
- from contextvars import copy_context
15
- from types import SimpleNamespace
16
- from typing import Any
17
- from typing import Callable
18
-
19
- import torch
20
- from torch.overrides import TorchFunctionMode
21
- from torch.overrides import resolve_name
22
- from torch.utils._python_dispatch import TorchDispatchMode
23
- from torch.utils._pytree import tree_map_only
24
- from torch.utils.weak import WeakTensorKeyDictionary
25
-
26
- from ...config import Config
27
- from ...utils import malloc_trim
28
- from ..tqdm import tqdm
29
- from . import bitsandbytes
30
- from .packing import ZeroGPUTensorPack
31
- from .packing import pack_tensors
32
- from .packing import pack_to_cuda
33
- from .types import AliasId
34
-
35
-
36
- # Nvidia A100.80G MIG (drivers 535) / Torch 2.2.0
37
- CUDA_DEVICE_NAME = 'NVIDIA A100-SXM4-80GB MIG 3g.40gb'
38
- CUDA_TOTAL_MEMORY = 42144366592
39
- CUDA_MEM_GET_INFO = (41911451648, CUDA_TOTAL_MEMORY)
40
- CUDA_DEVICE_CAPABILITY = (8, 0)
41
- CUDA_DEVICE_PROPERTIES = SimpleNamespace(name=CUDA_DEVICE_NAME, major=8, minor=0, total_memory=CUDA_TOTAL_MEMORY, multi_processor_count=42)
42
-
43
- OPS_INPUTS_CHECK_NO_RETURN = (
44
- torch.Tensor.equal,
45
- )
46
-
47
- OPS_INPUT_CHECK_SELF_RETURN = (
48
- torch.Tensor.set_, # probably never dispatched
49
- torch.ops.aten.set_.source_Tensor, # pyright: ignore [reportAttributeAccessIssue]
50
- )
51
-
52
- OFFLOADED_ERROR_MESSAGE = "Cannot apply function {} on disk-offloaded Tensor {}"
53
-
54
- _tensor_make_subclass = torch.Tensor._make_subclass
55
- _asarray = torch.asarray
56
- _cuda_init = torch._C._cuda_init
57
- _cuda_exchange_device = torch.cuda._exchange_device
58
- _cuda_available = torch.cuda.is_available
59
- _cuda_device_count = torch.cuda.device_count
60
- _cuda_current_device = torch.cuda.current_device
61
- _cuda_mem_get_info = torch.cuda.mem_get_info
62
- _cuda_get_device_capability = torch.cuda.get_device_capability
63
- _cuda_get_device_properties = torch.cuda.get_device_properties
64
- _cuda_get_device_name = torch.cuda.get_device_name
65
-
66
- # PyTorch 2.3
67
- _cuda_maybe_exchange_device = getattr(torch.cuda, '_maybe_exchange_device', None)
68
-
69
-
70
- cuda_aliases: dict[torch.Tensor, torch.Tensor | None] = WeakTensorKeyDictionary() # pyright: ignore [reportAssignmentType]
71
-
72
- tensor_packs: list[ZeroGPUTensorPack] = []
73
-
74
- class ZeroGPUTensor(torch.Tensor):
75
- pass
76
-
77
- def empty_fake(tensor: torch.Tensor):
78
- fake = torch.empty_like(tensor, requires_grad=tensor.requires_grad)
79
- if fake.__class__ != tensor.__class__:
80
- fake = _tensor_make_subclass(tensor.__class__, fake, require_grad=tensor.requires_grad) # pyright: ignore [reportArgumentType]
81
- return fake
82
-
83
- class ZeroGPUFunctionMode(TorchFunctionMode):
84
-
85
- def __torch_function__(self, func, types, args=(), kwargs: dict[str, Any] | None = None):
86
-
87
- kwargs = {} if kwargs is None else kwargs
88
-
89
- if func == torch._C._nn._parse_to:
90
- return func(*args, **kwargs)
91
-
92
- # Redispatch: tensor.cuda() -> tensor.to(device='cuda')
93
- if func == torch.Tensor.cuda or func == torch.Tensor.cpu:
94
- memory_format = kwargs.get('memory_format')
95
- return self.__torch_function__(torch.Tensor.to, types, (args[0],), {
96
- 'device': 'cuda' if func == torch.Tensor.cuda else 'cpu',
97
- **({'memory_format': memory_format} if memory_format is not None else {}),
98
- })
99
-
100
- # Redispatch: tensor.to('cuda') -> tensor.to(device='cuda')
101
- if func == torch.Tensor.to and len(args) > 1:
102
- device, dtype, _, memory_format = torch._C._nn._parse_to(*args[1:], **kwargs)
103
- return self.__torch_function__(torch.Tensor.to, types, (args[0],), {
104
- 'device': device,
105
- 'dtype': dtype,
106
- 'memory_format': memory_format,
107
- })
108
-
109
- if func == torch.Tensor.data.__set__: # pyright: ignore [reportAttributeAccessIssue]
110
- self, target = args
111
- if target in cuda_aliases:
112
- if (target_original := cuda_aliases[target]) is None:
113
- raise Exception(OFFLOADED_ERROR_MESSAGE.format(resolve_name(func), target))
114
- original = empty_fake(self)
115
- original.data = target_original
116
- cuda_aliases[self] = original
117
- elif self in cuda_aliases:
118
- del cuda_aliases[self]
119
- self.data = target
120
- return
121
-
122
- if func == torch.Tensor.device.__get__:
123
- tensor, = args
124
- if tensor in cuda_aliases:
125
- return torch.device('cuda', index=0)
126
-
127
- elif func == torch.Tensor.__repr__:
128
- tensor, = args
129
- if tensor in cuda_aliases:
130
- if (original := cuda_aliases[tensor]) is None:
131
- original = tensor.to('meta')
132
- original_class = original.__class__
133
- original.__class__ = ZeroGPUTensor
134
- try:
135
- return func(original, **kwargs)
136
- finally:
137
- original.__class__ = original_class
138
-
139
- elif func == torch.Tensor.untyped_storage:
140
- tensor, = args
141
- if tensor in cuda_aliases:
142
- if (original := cuda_aliases[tensor]) is None:
143
- raise Exception(OFFLOADED_ERROR_MESSAGE.format(resolve_name(func), tensor))
144
- res = func(original, **kwargs)
145
- res._zerogpu = True
146
- return res
147
-
148
- cuda: bool | None = None
149
-
150
- # Handle device kwarg
151
- if (device := kwargs.get('device')) is not None:
152
- device = torch.device(device)
153
- if device.type == 'cuda':
154
- kwargs['device'] = torch.device('cpu')
155
- cuda = True
156
- else:
157
- cuda = False
158
-
159
- # Swap fake inputs with original data
160
- swapped = {}
161
- inputs_are_cuda = set()
162
- def swap(tensor: torch.Tensor):
163
- nonlocal inputs_are_cuda
164
- if tensor not in cuda_aliases:
165
- inputs_are_cuda |= {False}
166
- return tensor
167
- if (original := cuda_aliases[tensor]) is None:
168
- raise Exception(OFFLOADED_ERROR_MESSAGE.format(resolve_name(func), tensor))
169
- swapped[original] = tensor
170
- inputs_are_cuda |= {True}
171
- return original
172
- args_ = tree_map_only(torch.Tensor, swap, args)
173
- kwargs_ = tree_map_only(torch.Tensor, swap, kwargs)
174
- if inputs_are_cuda == {True}:
175
- if cuda is not False:
176
- cuda = True
177
-
178
- res = func(*args_, **kwargs_)
179
-
180
- # Re-generate swapped fakes in case of mutation
181
- for original, fake in swapped.items():
182
- fake.data = empty_fake(original)
183
-
184
- # Special case for Tensor indexing where only 'self' matters
185
- if func in {
186
- torch.ops.aten.index.Tensor, # pyright: ignore [reportAttributeAccessIssue]
187
- torch.Tensor.__getitem__, # PyTorch 2.4+
188
- }:
189
- self = args[0]
190
- cuda = self in cuda_aliases
191
- inputs_are_cuda = {cuda}
192
-
193
- # Emulate device check
194
- if isinstance(res, torch.Tensor) or func in OPS_INPUTS_CHECK_NO_RETURN:
195
- self = None
196
- if len(args_) >= 1 and isinstance(args_[0], torch.Tensor):
197
- self = args_[0]
198
- # Only raise if func does not return its first input (Tensor.copy_)
199
- if res is not self or func in OPS_INPUT_CHECK_SELF_RETURN:
200
- if inputs_are_cuda == {True, False}:
201
- raise RuntimeError(
202
- "Expected all tensors to be on the same device, "
203
- "but found at least two devices, cuda:0 (ZeroGPU) and cpu!"
204
- )
205
-
206
- # Register output
207
- def register(tensor: torch.Tensor):
208
- if tensor in swapped and cuda is not False:
209
- return swapped[tensor]
210
- if cuda is not True:
211
- return tensor
212
- fake = empty_fake(tensor)
213
- cuda_aliases[fake] = tensor
214
- return fake
215
-
216
- return tree_map_only(torch.Tensor, register, res)
217
-
218
- # When enabling DispatchMode, some aten ops are dispatched to FunctionMode
219
- # We are using it for aten.alias.default and aten.set_.source_Tensor
220
- class DefaultDispatchMode(TorchDispatchMode):
221
- def __torch_dispatch__(self, func, types, args=(), kwargs: dict[str, Any] | None = None):
222
- return func(*args, **(kwargs or {}))
223
-
224
-
225
- function_mode = ZeroGPUFunctionMode()
226
- dispatch_mode = DefaultDispatchMode()
227
-
228
-
229
- def _untyped_storage_new_register(*args, **kwargs):
230
- cuda = False
231
- if (device := kwargs.get('device')) is not None and device.type == 'cuda':
232
- cuda = True
233
- del kwargs['device']
234
- storage = torch._C.StorageBase.__new__(*args, **kwargs)
235
- if cuda:
236
- storage._zerogpu = True
237
- return storage
238
-
239
- @property
240
- def _untyped_storage_device(self):
241
- if hasattr(self, '_zerogpu'):
242
- return torch.device('cuda', index=0)
243
- return torch._C.StorageBase.device.__get__(self) # pyright: ignore [reportAttributeAccessIssue]
244
-
245
- # Force dispatch
246
- def _tensor_make_subclass_function_mode(*args, **kwargs):
247
- with torch._C.DisableTorchFunction():
248
- return function_mode.__torch_function__(_tensor_make_subclass, (), args=args, kwargs=kwargs)
249
- def _asarray_function_mode(*args, **kwargs):
250
- with torch._C.DisableTorchFunction():
251
- return function_mode.__torch_function__(_asarray, (), args=args, kwargs=kwargs)
252
-
253
- def _cuda_init_raise():
254
- raise RuntimeError(
255
- "CUDA must not be initialized in the main process "
256
- "on Spaces with Stateless GPU environment.\n"
257
- "You can look at this Stacktrace to find out "
258
- "which part of your code triggered a CUDA init"
259
- )
260
-
261
- def _cuda_dummy_exchange_device(device):
262
- assert device in {-1, 0}
263
- return device
264
-
265
- def patch():
266
- function_mode.__enter__()
267
- dispatch_mode.__enter__()
268
- # TODO: only patch bellow methods on current Thread to be consistent with TorchModes
269
- # (or hijack threading.Thread.__init__ to force Modes on all threads)
270
- torch.Tensor._make_subclass = _tensor_make_subclass_function_mode # pyright: ignore [reportAttributeAccessIssue]
271
- torch.UntypedStorage.__new__ = _untyped_storage_new_register
272
- torch.UntypedStorage.device = _untyped_storage_device # pyright: ignore [reportAttributeAccessIssue]
273
- torch.asarray = _asarray_function_mode
274
- torch._C._cuda_init = _cuda_init_raise
275
- torch.cuda._exchange_device = _cuda_dummy_exchange_device
276
- torch.cuda.is_available = lambda: True
277
- torch.cuda.device_count = lambda: 1
278
- torch.cuda.current_device = lambda: 0
279
- torch.cuda.mem_get_info = lambda *args, **kwargs: CUDA_MEM_GET_INFO
280
- torch.cuda.get_device_capability = lambda *args, **kwargs: CUDA_DEVICE_CAPABILITY
281
- torch.cuda.get_device_properties = lambda *args, **kwargs: CUDA_DEVICE_PROPERTIES
282
- torch.cuda.get_device_name = lambda *args, **kwargs: CUDA_DEVICE_NAME
283
- # PyTorch 2.3
284
- if _cuda_maybe_exchange_device is not None: # pragma: no cover
285
- setattr(torch.cuda, '_maybe_exchange_device', _cuda_dummy_exchange_device)
286
- bitsandbytes.patch()
287
-
288
- def unpatch():
289
- try:
290
- dispatch_mode.__exit__(None, None, None)
291
- function_mode.__exit__(None, None, None)
292
- except RuntimeError:
293
- pass # patch() and unpatch() called from != threads
294
- torch.Tensor._make_subclass = _tensor_make_subclass
295
- torch.UntypedStorage.__new__ = torch._C.StorageBase.__new__
296
- torch.UntypedStorage.device = torch._C.StorageBase.device # pyright: ignore [reportAttributeAccessIssue]
297
- torch.asarray = _asarray
298
- torch._C._cuda_init = _cuda_init
299
- torch.cuda._exchange_device = _cuda_exchange_device
300
- torch.cuda.is_available = _cuda_available
301
- torch.cuda.device_count = _cuda_device_count
302
- torch.cuda.current_device = _cuda_current_device
303
- torch.cuda.mem_get_info = _cuda_mem_get_info
304
- torch.cuda.get_device_capability = _cuda_get_device_capability
305
- torch.cuda.get_device_properties = _cuda_get_device_properties
306
- torch.cuda.get_device_name = _cuda_get_device_name
307
- # PyTorch 2.3
308
- if _cuda_maybe_exchange_device is not None: # pragma: no cover
309
- setattr(torch.cuda, '_maybe_exchange_device', _cuda_exchange_device)
310
- bitsandbytes.unpatch()
311
-
312
-
313
- def _total_unpacked_size():
314
- tensors = [tensor for tensor in cuda_aliases.values() if tensor is not None]
315
- deduped = {AliasId.from_tensor(tensor): tensor for tensor in tensors}
316
- return sum([tensor.numel() * tensor.element_size() for tensor in deduped.values()])
317
-
318
-
319
- def _pack(offload_dir: str):
320
- # Pack to disk
321
- originals: set[torch.Tensor] = set()
322
- originals_dedup: dict[AliasId, torch.Tensor] = {}
323
- fakes: dict[torch.Tensor, list[torch.Tensor]] = defaultdict(list)
324
- for fake, original in cuda_aliases.items():
325
- # TODO filter-out sparse Tensors
326
- if original is not None:
327
- original_id = AliasId.from_tensor(original)
328
- if original_id not in originals_dedup:
329
- originals_dedup[original_id] = original
330
- originals |= {original}
331
- fakes[originals_dedup[original_id]] += [fake]
332
- progress = tqdm(
333
- total=_total_unpacked_size(),
334
- unit='B',
335
- unit_scale=True,
336
- desc="ZeroGPU tensors packing",
337
- ) if tqdm is not None else nullcontext()
338
- with progress as progress:
339
- update = progress.update if progress is not None else lambda _: None
340
- pack = pack_tensors(originals, fakes, offload_dir, callback=update)
341
- tensor_packs.append(pack)
342
- # Free memory
343
- for fake_list in fakes.values():
344
- for fake in fake_list:
345
- cuda_aliases[fake] = None
346
-
347
- def pack():
348
- _pack(Config.zerogpu_offload_dir)
349
- gc.collect()
350
- malloc_trim()
351
-
352
- def init(nvidia_uuid: str):
353
- os.environ['CUDA_VISIBLE_DEVICES'] = nvidia_uuid
354
- torch.Tensor([0]).cuda()
355
-
356
- def size():
357
- return _total_unpacked_size() + sum([pack.total_size for pack in tensor_packs])
358
-
359
- def _move(callback: Callable[[int]] | None = None):
360
- callback = callback if callback is not None else lambda _: None
361
- # CPU -> CUDA
362
- moved: dict[AliasId, torch.Tensor] = {}
363
- for fake, original in cuda_aliases.items():
364
- if original is not None:
365
- original_id = AliasId.from_tensor(original)
366
- if original_id not in moved:
367
- moved[original_id] = original.cuda()
368
- callback(fake.numel() * fake.element_size())
369
- for fake, original in cuda_aliases.items():
370
- if original is not None:
371
- fake.data = moved[AliasId.from_tensor(original)]
372
- # Disk -> CUDA
373
- for tensor_pack in tensor_packs:
374
- pack_to_cuda(tensor_pack, callback=callback)
375
- bitsandbytes.move()
376
-
377
- def move(callback: Callable[[int]] | None = None):
378
- callback = callback if callback is not None else lambda _: None
379
- with ThreadPoolExecutor(1) as e:
380
- e.submit(copy_context().run, _move, callback=callback).result()
381
- torch.cuda.synchronize()
382
-
383
- def is_in_bad_fork():
384
- with ProcessPoolExecutor(mp_context=multiprocessing.get_context('fork')) as e:
385
- f = e.submit(torch.cuda._is_in_bad_fork)
386
- return f.result()
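Not part of the diff: a minimal sketch of the call order a forked GPU worker is expected to follow with the module deleted above, mirroring the sequence that appears later in spaces/zero/wrappers.py (unpatch, then init, then move). The import path spaces.zero.torch.patching, the helper name bring_up_gpu, and the nvidia_uuid value are assumptions for illustration only.

    # Hypothetical driver code, assuming the file above is importable as
    # spaces.zero.torch.patching and that `nvidia_uuid` comes from the scheduler.
    from spaces.zero.torch import patching

    def bring_up_gpu(nvidia_uuid: str) -> None:
        patching.unpatch()                 # restore the real torch / CUDA entry points
        patching.init(nvidia_uuid)         # set CUDA_VISIBLE_DEVICES and trigger CUDA init
        total = patching.size()            # bytes left to transfer (unpacked RAM + packed disk tensors)
        patching.move(callback=lambda n: print(f"transferred {n} of {total} bytes"))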
 
spaces/zero/torch/patching_legacy.py DELETED
@@ -1,266 +0,0 @@
1
- """
2
- """
3
- # pyright: reportPrivateImportUsage=false
4
-
5
- from __future__ import annotations
6
-
7
- import multiprocessing
8
- import os
9
- from concurrent.futures import ProcessPoolExecutor
10
- from contextlib import suppress
11
- from functools import partial
12
- from types import SimpleNamespace
13
- from typing import Any
14
- from typing import Callable
15
- from typing import Optional
16
- from typing import Tuple
17
-
18
- import torch
19
- from torch.utils.weak import WeakTensorKeyDictionary
20
-
21
- from ...config import Config
22
- from . import bitsandbytes
23
-
24
-
25
- # Nvidia A100.80G MIG (drivers 535) / Torch 2.2.0
26
- CUDA_DEVICE_NAME = 'NVIDIA A100-SXM4-80GB MIG 3g.40gb'
27
- CUDA_TOTAL_MEMORY = 42144366592
28
- CUDA_MEM_GET_INFO = (41911451648, CUDA_TOTAL_MEMORY)
29
- CUDA_DEVICE_CAPABILITY = (8, 0)
30
- CUDA_DEVICE_PROPERTIES = SimpleNamespace(name=CUDA_DEVICE_NAME, major=8, minor=0, total_memory=CUDA_TOTAL_MEMORY, multi_processor_count=42)
31
-
32
- GENERIC_METHOD_NAMES = [
33
- 'arange',
34
- 'as_tensor',
35
- 'asarray',
36
- 'bartlett_window',
37
- 'blackman_window',
38
- 'empty',
39
- 'empty_like',
40
- 'empty_strided',
41
- 'eye',
42
- 'full',
43
- 'full_like',
44
- 'hamming_window',
45
- 'hann_window',
46
- 'kaiser_window',
47
- 'linspace',
48
- 'logspace',
49
- 'ones',
50
- 'ones_like',
51
- 'rand',
52
- 'rand_like',
53
- 'randint',
54
- 'randint_like',
55
- 'randn',
56
- 'randn_like',
57
- 'randperm',
58
- 'range',
59
- 'sparse_bsc_tensor',
60
- 'sparse_bsr_tensor',
61
- 'sparse_compressed_tensor',
62
- 'sparse_coo_tensor',
63
- 'sparse_csc_tensor',
64
- 'sparse_csr_tensor',
65
- 'tensor',
66
- 'tril_indices',
67
- 'triu_indices',
68
- 'zeros',
69
- 'zeros_like',
70
- ]
71
-
72
-
73
- TO_CUDA = (torch.device('cuda'), None, False, None)
74
-
75
- _tensor__deepcopy__ = torch.Tensor.__deepcopy__
76
- _tensor_to = torch.Tensor.to
77
- _tensor_cuda = torch.Tensor.cuda
78
- _tensor_cpu = torch.Tensor.cpu
79
- _torch_generics = {name: getattr(torch, name) for name in GENERIC_METHOD_NAMES}
80
- _cuda_init = torch._C._cuda_init
81
- _cuda_available = torch.cuda.is_available
82
- _cuda_device_count = torch.cuda.device_count
83
- _cuda_current_device = torch.cuda.current_device
84
- _cuda_mem_get_info = torch.cuda.mem_get_info
85
- _cuda_get_device_capability = torch.cuda.get_device_capability
86
- _cuda_get_device_properties = torch.cuda.get_device_properties
87
- _cuda_get_device_name = torch.cuda.get_device_name
88
-
89
- TensorToArgs = Tuple[Optional[torch.device], Optional[torch.dtype], bool, Optional[torch.memory_format]]
90
-
91
- to_ops: dict[torch.Tensor, TensorToArgs] = WeakTensorKeyDictionary() # type: ignore
92
-
93
- def _tensor_new_register(*args, **kwargs):
94
- new_tensor: torch.Tensor = torch._C._TensorBase.__new__(*args, **kwargs)
95
- if (base_tensor := new_tensor._base) is not None:
96
- if base_tensor in to_ops:
97
- to_ops[new_tensor] = to_ops[base_tensor]
98
- return new_tensor
99
-
100
- def _tensor_deepcopy_register(self: torch.Tensor, memo):
101
- new_tensor = _tensor__deepcopy__(self, memo)
102
- if isinstance(new_tensor, torch.Tensor):
103
- if self in to_ops:
104
- to_ops[new_tensor] = to_ops[self]
105
- return new_tensor
106
-
107
- @property
108
- def _tensor_device_property(self: torch.Tensor):
109
- if self in to_ops:
110
- return torch.device(type='cuda', index=0)
111
- del torch.Tensor.device
112
- try:
113
- return self.device
114
- finally:
115
- torch.Tensor.device = _tensor_device_property # type: ignore
116
-
117
- @property
118
- def _tensor_dtype_property(self: torch.Tensor):
119
- if self in to_ops:
120
- if (to_dtype := to_ops[self][1]) is not None:
121
- return to_dtype
122
- del torch.Tensor.dtype
123
- try:
124
- return self.dtype
125
- finally:
126
- torch.Tensor.dtype = _tensor_dtype_property # type: ignore
127
-
128
- def _to_op_register(self: torch.Tensor, *args, **kwargs):
129
- parsed = torch._C._nn._parse_to(*args, **kwargs)
130
- device, dtype, *_ = parsed
131
- try:
132
- to_args = to_ops.pop(self)
133
- except KeyError:
134
- to_args = None
135
- if device is None: # pyright: ignore [reportUnnecessaryComparison]
136
- if to_args is not None:
137
- to_ops[self] = (to_args[0], dtype, *to_args[2:])
138
- return self
139
- return _tensor_to(self, *args, **kwargs)
140
- if device.type != 'cuda':
141
- if to_args is not None:
142
- if (to_dtype := to_args[1]) is not None:
143
- kwargs = {'dtype': to_dtype, **kwargs}
144
- return _tensor_to(self, *args, **kwargs)
145
- to_ops[self] = parsed
146
- return self
147
-
148
- def _cuda_op_arg_check(device: torch.device | int | str | None) -> bool:
149
- if device is None:
150
- return True
151
- if isinstance(device, int):
152
- return True
153
- if isinstance(device, str):
154
- device = torch.device(device)
155
- return device.type == 'cuda'
156
-
157
- def _cuda_op_register(self: torch.Tensor, device: torch.device | int | str | None = None, **kwargs):
158
- if not _cuda_op_arg_check(device):
159
- # Let PyTorch handle the fail
160
- return _tensor_cuda(self, device, **kwargs)
161
- to_ops[self] = TO_CUDA
162
- return self
163
-
164
- def _cpu_op_remove(self: torch.Tensor, **kwargs):
165
- try:
166
- to_args = to_ops.pop(self)
167
- except KeyError:
168
- to_args = None
169
- if to_args is not None:
170
- if (to_dtype := to_args[1]) is not None:
171
- return _tensor_to(self, 'cpu', **{'dtype': to_dtype, **kwargs})
172
- return _tensor_cpu(self, **kwargs)
173
-
174
- def _cuda_init_raise():
175
- raise RuntimeError(
176
- "CUDA must not be initialized in the main process "
177
- "on Spaces with Stateless GPU environment.\n"
178
- "You can look at this Stacktrace to find out "
179
- "which part of your code triggered a CUDA init"
180
- )
181
-
182
- def _generic_method_register(name: str, *args: Any, **kwargs: Any):
183
- try:
184
- device = torch.device(kwargs.get('device', "cpu"))
185
- except Exception:
186
- return _torch_generics[name](*args, **kwargs)
187
- if device.type != 'cuda':
188
- return _torch_generics[name](*args, **kwargs)
189
- tensor = _torch_generics[name](*args, **{**kwargs, 'device': "cpu"})
190
- to_ops[tensor] = TO_CUDA
191
- return tensor
192
-
193
- def patch():
194
- torch.Tensor.__deepcopy__ = _tensor_deepcopy_register
195
- torch.Tensor.__new__ = _tensor_new_register # pyright: ignore [reportAttributeAccessIssue]
196
- torch.Tensor.to = _to_op_register # type: ignore
197
- torch.Tensor.cuda = _cuda_op_register # type: ignore
198
- torch.Tensor.cpu = _cpu_op_remove # type: ignore
199
- if Config.zero_patch_torch_device:
200
- torch.Tensor.device = _tensor_device_property # type: ignore
201
- torch.Tensor.dtype = _tensor_dtype_property # pyright: ignore [reportAttributeAccessIssue]
202
- for name in GENERIC_METHOD_NAMES:
203
- setattr(torch, name, partial(_generic_method_register, name))
204
- torch._C._cuda_init = _cuda_init_raise
205
- torch.cuda.is_available = lambda: True
206
- torch.cuda.device_count = lambda: 1
207
- torch.cuda.current_device = lambda: 0
208
- torch.cuda.mem_get_info = lambda *args, **kwargs: CUDA_MEM_GET_INFO
209
- torch.cuda.get_device_capability = lambda *args, **kwargs: CUDA_DEVICE_CAPABILITY
210
- torch.cuda.get_device_properties = lambda *args, **kwargs: CUDA_DEVICE_PROPERTIES
211
- torch.cuda.get_device_name = lambda *args, **kwargs: CUDA_DEVICE_NAME
212
- bitsandbytes.patch()
213
-
214
- def unpatch():
215
- torch.Tensor.__deepcopy__ = _tensor__deepcopy__
216
- with suppress(AttributeError):
217
- del torch.Tensor.__new__
218
- torch.Tensor.to = _tensor_to
219
- torch.Tensor.cuda = _tensor_cuda
220
- torch.Tensor.cpu = _tensor_cpu
221
- with suppress(AttributeError):
222
- del torch.Tensor.device
223
- with suppress(AttributeError):
224
- del torch.Tensor.dtype
225
- for name in GENERIC_METHOD_NAMES:
226
- setattr(torch, name, _torch_generics[name])
227
- torch._C._cuda_init = _cuda_init
228
- torch.cuda.is_available = _cuda_available
229
- torch.cuda.device_count = _cuda_device_count
230
- torch.cuda.current_device = _cuda_current_device
231
- torch.cuda.mem_get_info = _cuda_mem_get_info
232
- torch.cuda.get_device_capability = _cuda_get_device_capability
233
- torch.cuda.get_device_properties = _cuda_get_device_properties
234
- torch.cuda.get_device_name = _cuda_get_device_name
235
- bitsandbytes.unpatch()
236
-
237
- def pack():
238
- pass
239
-
240
- def init(nvidia_uuid: str):
241
- os.environ['CUDA_VISIBLE_DEVICES'] = nvidia_uuid
242
- torch.Tensor([0]).cuda() # CUDA init
243
-
244
- def size():
245
- return 0
246
-
247
- def move(callback: Callable[[int]] | None = None):
248
- for op in to_ops.items():
249
- tensor, parsed_args = op
250
- _, dtype, _, memory_format = parsed_args
251
- tensor.data = _tensor_to(tensor,
252
- device='cuda',
253
- dtype=dtype,
254
- memory_format=memory_format,
255
- ) # type: ignore
256
- bitsandbytes.move()
257
- torch.cuda.synchronize()
258
-
259
- def is_in_bad_fork():
260
- with ProcessPoolExecutor(mp_context=multiprocessing.get_context('fork')) as e:
261
- f = e.submit(torch.cuda._is_in_bad_fork)
262
- return f.result()
263
-
264
- def disable_cuda_intercept():
265
- torch.Tensor.to = _tensor_to
266
- torch.Tensor.cuda = _tensor_cuda
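Not part of the diff: a hedged sketch of the interception behaviour the legacy patcher above implements. While patch() is active, .cuda() only records the requested move in to_ops and returns the still-CPU tensor, and torch.cuda.is_available() reports the fake A100 MIG device; move() replays the recorded moves once a real GPU is attached. The import path is an assumption, and the snippet assumes torch (and optionally bitsandbytes) is installed.

    import torch
    from spaces.zero.torch import patching_legacy as legacy  # assumed import path

    legacy.patch()
    try:
        t = torch.ones(4).cuda()           # no real CUDA init: the move is only recorded in `to_ops`
        assert torch.cuda.is_available()   # patched to report the fake A100 MIG device
    finally:
        legacy.unpatch()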
 
spaces/zero/torch/types.py DELETED
@@ -1,23 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- from typing import NamedTuple
6
-
7
- import torch
8
-
9
-
10
- class AliasId(NamedTuple):
11
- data_ptr: int
12
- dtype: torch.dtype
13
- shape: tuple[int, ...]
14
- stride: tuple[int, ...]
15
-
16
- @classmethod
17
- def from_tensor(cls, tensor: torch.Tensor):
18
- return cls(
19
- tensor.data_ptr(),
20
- tensor.dtype,
21
- tensor.shape,
22
- tensor.stride(),
23
- )
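Not part of the diff: a small example of how AliasId keys tensors by (data_ptr, dtype, shape, stride), so distinct Python objects that view the same memory collapse to one entry. This is what the packing code earlier in the commit relies on to avoid offloading the same weights twice. The import path is an assumption.

    import torch
    from spaces.zero.torch.types import AliasId  # assumed import path

    a = torch.zeros(2, 3)
    b = a.view(2, 3)                                              # same storage, dtype, shape and stride
    assert AliasId.from_tensor(a) == AliasId.from_tensor(b)
    assert AliasId.from_tensor(a) != AliasId.from_tensor(a.t())   # transposed view: different shape/stride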
 
spaces/zero/tqdm.py DELETED
@@ -1,24 +0,0 @@
1
- """
2
- """
3
-
4
- from multiprocessing.synchronize import RLock as MultiprocessingRLock
5
-
6
-
7
- try:
8
- from tqdm import tqdm as _tqdm
9
- except ImportError: # pragma: no cover
10
- _tqdm = None
11
-
12
-
13
- def remove_tqdm_multiprocessing_lock():
14
- if _tqdm is None: # pragma: no cover
15
- return
16
- tqdm_lock = _tqdm.get_lock()
17
- assert tqdm_lock.__class__.__name__ == 'TqdmDefaultWriteLock'
18
- tqdm_lock.locks = [
19
- lock for lock in tqdm_lock.locks
20
- if not isinstance(lock, MultiprocessingRLock)
21
- ]
22
-
23
-
24
- tqdm = _tqdm
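Not part of the diff: a hedged sketch of when the helper above would be called. The forked worker in spaces/zero/wrappers.py (further down in this commit) drops tqdm's fork-inherited multiprocessing RLock right after initialization; the snippet assumes tqdm is installed and the module is importable as spaces.zero.tqdm.

    from spaces.zero.tqdm import remove_tqdm_multiprocessing_lock, tqdm

    if tqdm is not None:                      # tqdm is an optional dependency
        remove_tqdm_multiprocessing_lock()    # drop the fork-inherited multiprocessing RLock
        for _ in tqdm(range(3), desc="worker-local progress"):
            pass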
 
spaces/zero/types.py DELETED
@@ -1,49 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
-
6
- from dataclasses import dataclass
7
- from datetime import timedelta
8
- from typing import Any
9
- from typing import Dict
10
- from typing import Tuple
11
- from typing import TypedDict
12
- from typing_extensions import Callable
13
- from typing_extensions import Generic
14
- from typing_extensions import ParamSpec
15
- from typing_extensions import TypeAlias
16
- from typing_extensions import TypeVar
17
-
18
-
19
- Params = Tuple[Tuple[object, ...], Dict[str, Any]]
20
- Res = TypeVar('Res')
21
- Param = ParamSpec('Param')
22
-
23
- class EmptyKwargs(TypedDict):
24
- pass
25
-
26
- @dataclass
27
- class OkResult(Generic[Res]):
28
- value: Res
29
- @dataclass
30
- class ExceptionResult:
31
- value: Exception
32
- @dataclass
33
- class AbortedResult:
34
- pass
35
- @dataclass
36
- class EndResult:
37
- pass
38
- @dataclass
39
- class GradioQueueEvent:
40
- method_name: str
41
- args: tuple[Any, ...]
42
- kwargs: dict[str, Any]
43
-
44
- RegularResQueueResult: TypeAlias = "OkResult[Res] | ExceptionResult | GradioQueueEvent"
45
- GeneratorResQueueResult: TypeAlias = "OkResult[Res] | ExceptionResult | EndResult | GradioQueueEvent"
46
- YieldQueueResult: TypeAlias = "OkResult[Res] | ExceptionResult | EndResult | AbortedResult"
47
-
48
- Duration: TypeAlias = "int | timedelta"
49
- DynamicDuration: TypeAlias = "Duration | Callable[Param, Duration] | None"
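Not part of the diff: a hedged sketch of dispatching on the result types defined above, mirroring the isinstance checks used in spaces/zero/wrappers.py below, where these dataclasses tag messages coming back from the worker over the result queue. The handle function is hypothetical.

    from spaces.zero.types import (
        AbortedResult, EndResult, ExceptionResult, GradioQueueEvent, OkResult,
    )

    def handle(res) -> str:
        if isinstance(res, OkResult):
            return f"value: {res.value!r}"
        if isinstance(res, ExceptionResult):
            raise res.value
        if isinstance(res, EndResult):
            return "generator finished"
        if isinstance(res, AbortedResult):
            raise RuntimeError("GPU task aborted")
        if isinstance(res, GradioQueueEvent):
            return f"queue event: {res.method_name}"
        raise TypeError(f"unexpected result: {res!r}")

    print(handle(OkResult(value=42)))   # -> value: 42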
 
spaces/zero/wrappers.py DELETED
@@ -1,418 +0,0 @@
1
- """
2
- """
3
- from __future__ import annotations
4
-
5
- import multiprocessing
6
- import os
7
- import signal
8
- import traceback
9
- import warnings
10
- from concurrent.futures import ThreadPoolExecutor
11
- from contextlib import nullcontext
12
- from contextvars import copy_context
13
- from datetime import timedelta
14
- from functools import partial
15
- from functools import wraps
16
- from multiprocessing.context import ForkProcess
17
- from pickle import PicklingError
18
- from queue import Empty
19
- from queue import Queue as ThreadQueue
20
- from threading import Thread
21
- from typing import TYPE_CHECKING
22
- from typing import Callable
23
- from typing import Generator
24
- from typing import Generic
25
- from typing_extensions import assert_never
26
-
27
- import psutil
28
-
29
- from ..config import Config
30
- from ..utils import debug
31
- from ..utils import drop_params
32
- from ..utils import gradio_request_var
33
- from ..utils import SimpleQueue as Queue
34
- from . import client
35
- from . import torch
36
- from .api import AllowToken
37
- from .api import NvidiaIndex
38
- from .api import NvidiaUUID
39
- from .gradio import GradioPartialContext
40
- from .gradio import get_server_port
41
- from .gradio import patch_gradio_queue
42
- from .gradio import try_process_queue_event
43
- from .tqdm import remove_tqdm_multiprocessing_lock
44
- from .tqdm import tqdm
45
- from .types import * # TODO: Please don't do that
46
-
47
-
48
- GENERATOR_GLOBAL_TIMEOUT = 20 * 60
49
-
50
- SPAWN_PROGRESS_CLEANUP = 0.1
51
- SPAWN_PROGRESS_INIT = 0.1
52
-
53
-
54
- Process = multiprocessing.get_context('fork').Process
55
- forked = False
56
-
57
-
58
- class Worker(Generic[Res]):
59
- process: ForkProcess
60
- arg_queue: Queue[tuple[Params, GradioPartialContext]]
61
- res_queue: Queue[Res | None]
62
- _sentinel: Thread
63
-
64
- def __init__(
65
- self,
66
- target: Callable[[
67
- Queue[tuple[Params, GradioPartialContext]],
68
- Queue[Res | None],
69
- AllowToken,
70
- NvidiaUUID,
71
- list[int],
72
- ], None],
73
- allow_token: str,
74
- nvidia_uuid: str,
75
- ):
76
- self._sentinel = Thread(target=self._close_on_exit, daemon=True)
77
- self.arg_queue = Queue()
78
- self.res_queue = Queue()
79
- debug(f"{self.arg_queue._writer.fileno()=}") # pyright: ignore [reportAttributeAccessIssue]
80
- debug(f"{self.res_queue._writer.fileno()=}") # pyright: ignore [reportAttributeAccessIssue]
81
- if (server_port := get_server_port()) is not None:
82
- fds = [c.fd for c in psutil.Process().connections() if c.laddr.port == server_port]
83
- debug(f"{fds=}")
84
- else:
85
- warnings.warn("Using a ZeroGPU function outside of Gradio caching or request might block the app")
86
- fds = []
87
- args = self.arg_queue, self.res_queue, allow_token, nvidia_uuid, fds
88
- if TYPE_CHECKING:
89
- target(*args)
90
- self.process = Process(
91
- target=target,
92
- args=args,
93
- daemon=True,
94
- )
95
- self.process.start()
96
- self._sentinel.start()
97
-
98
- def _close_on_exit(self):
99
- self.process.join()
100
- self.arg_queue.close()
101
- self.res_queue.wlock_release()
102
- self.res_queue.put(None)
103
-
104
-
105
- def worker_init(
106
- res_queue: Queue[RegularResQueueResult | None] | Queue[GeneratorResQueueResult | None],
107
- allow_token: str,
108
- nvidia_uuid: str,
109
- fds: list[int],
110
- ) -> None | ExceptionResult:
111
- # Immediately close file descriptors
112
- for fd in fds:
113
- try:
114
- os.close(fd)
115
- except Exception as e: # pragma: no cover
116
- if isinstance(e, OSError) and e.errno == 9:
117
- continue
118
- traceback.print_exc()
119
- return ExceptionResult(e)
120
- progress = nullcontext()
121
- if tqdm is not None and Config.zero_gpu_v2:
122
- progress = tqdm(total=100, desc="ZeroGPU init", file=open(os.devnull, 'w'))
123
- try: # Unrecoverable init part
124
- patch_gradio_queue(res_queue)
125
- with progress as progress:
126
- current_progress = 0 # Gradio does not support float progress updates
127
- def update(n: float):
128
- nonlocal current_progress
129
- current_progress += n
130
- if progress is not None:
131
- progress.update(round(current_progress * 100) - progress.n)
132
- client.allow(allow_token)
133
- update(SPAWN_PROGRESS_CLEANUP)
134
- torch.unpatch()
135
- torch.init(nvidia_uuid)
136
- update(SPAWN_PROGRESS_INIT)
137
- callback = None
138
- if (transfer_size := torch.size()) > 0:
139
- remaining = 1 - (SPAWN_PROGRESS_CLEANUP + SPAWN_PROGRESS_INIT)
140
- callback = lambda n: update(n * remaining / transfer_size)
141
- torch.move(callback=callback)
142
- except Exception as e: # pragma: no cover
143
- traceback.print_exc()
144
- return ExceptionResult(e)
145
- try:
146
- remove_tqdm_multiprocessing_lock()
147
- except Exception: # pragma: no cover
148
- print("Error while trying to remove tqdm mp_lock:")
149
- traceback.print_exc()
150
-
151
-
152
- def process_duration(duration: Duration | None):
153
- if duration is None or isinstance(duration, timedelta):
154
- return duration
155
- return timedelta(seconds=duration)
156
-
157
-
158
- def static_duration(duration: DynamicDuration[Param], *args: Param.args, **kwargs: Param.kwargs):
159
- if not callable(duration):
160
- return duration
161
- return duration(*args, **kwargs)
162
-
163
-
164
- def regular_function_wrapper(
165
- task: Callable[Param, Res],
166
- duration: DynamicDuration[Param],
167
- ) -> Callable[Param, Res]:
168
-
169
- import gradio as gr
170
-
171
- request_var = gradio_request_var()
172
- workers: dict[NvidiaIndex, Worker[RegularResQueueResult[Res]]] = {}
173
- task_id = id(task)
174
-
175
- @wraps(task)
176
- def gradio_handler(*args: Param.args, **kwargs: Param.kwargs) -> Res:
177
-
178
- if forked:
179
- return task(*args, **kwargs)
180
-
181
- request = request_var.get()
182
- duration_ = static_duration(duration, *args, **kwargs)
183
- duration_ = process_duration(duration_)
184
- schedule_response = client.schedule(task_id=task_id, request=request, duration=duration_)
185
- allow_token = schedule_response.allowToken
186
- nvidia_index = schedule_response.nvidiaIndex
187
- nvidia_uuid = schedule_response.nvidiaUUID
188
- release = partial(client.release, allow_token)
189
-
190
- try:
191
- worker = workers.pop(nvidia_index)
192
- except KeyError:
193
- worker = None
194
-
195
- if worker is not None and worker.process.is_alive() and schedule_response.idle:
196
- assert worker.arg_queue.empty()
197
- assert worker.res_queue.empty()
198
- else:
199
- worker = Worker(thread_wrapper, allow_token, nvidia_uuid)
200
-
201
- try:
202
- worker.arg_queue.put(((args, kwargs), GradioPartialContext.get()))
203
- except PicklingError: # TODO: detailed serialization diagnostic
204
- release(fail=True)
205
- raise
206
-
207
- while True:
208
- res = worker.res_queue.get()
209
- if res is None:
210
- release(fail=True, allow_404=True)
211
- raise gr.Error("GPU task aborted")
212
- if isinstance(res, ExceptionResult):
213
- release(fail=True)
214
- raise res.value
215
- if isinstance(res, OkResult):
216
- release()
217
- workers[nvidia_index] = worker
218
- return res.value
219
- if isinstance(res, GradioQueueEvent):
220
- try_process_queue_event(res.method_name, *res.args, **res.kwargs)
221
- continue
222
- assert_never(res)
223
-
224
-
225
- def thread_wrapper(
226
- arg_queue: Queue[tuple[Params, GradioPartialContext]],
227
- res_queue: Queue[RegularResQueueResult[Res] | None],
228
- allow_token: str,
229
- nvidia_uuid: str,
230
- fds: list[int],
231
- ):
232
- global forked
233
- forked = True
234
- signal.signal(signal.SIGTERM, drop_params(arg_queue.close))
235
- initialized = False
236
- while True:
237
- try:
238
- (args, kwargs), gradio_context = arg_queue.get()
239
- except OSError:
240
- break
241
- if not initialized:
242
- if (res := worker_init(
243
- res_queue=res_queue,
244
- allow_token=allow_token,
245
- nvidia_uuid=nvidia_uuid,
246
- fds=fds,
247
- )) is not None:
248
- res_queue.put(res)
249
- return
250
- initialized = True
251
- GradioPartialContext.apply(gradio_context)
252
- context = copy_context()
253
- with ThreadPoolExecutor() as executor:
254
- future = executor.submit(context.run, task, *args, **kwargs) # type: ignore
255
- try:
256
- res = future.result()
257
- except Exception as e:
258
- traceback.print_exc()
259
- res = ExceptionResult(e)
260
- else:
261
- res = OkResult(res)
262
- try:
263
- res_queue.put(res)
264
- except PicklingError as e:
265
- res_queue.put(ExceptionResult(e))
266
-
267
- # https://github.com/python/cpython/issues/91002
268
- if not hasattr(task, '__annotations__'):
269
- gradio_handler.__annotations__ = {}
270
-
271
- return gradio_handler
272
-
273
-
274
- def generator_function_wrapper(
275
- task: Callable[Param, Generator[Res, None, None]],
276
- duration: DynamicDuration[Param],
277
- ) -> Callable[Param, Generator[Res, None, None]]:
278
-
279
- import gradio as gr
280
-
281
- request_var = gradio_request_var()
282
- workers: dict[NvidiaIndex, Worker[GeneratorResQueueResult[Res]]] = {}
283
- task_id = id(task)
284
-
285
- @wraps(task)
286
- def gradio_handler(*args: Param.args, **kwargs: Param.kwargs) -> Generator[Res, None, None]:
287
-
288
- if forked:
289
- yield from task(*args, **kwargs)
290
- return
291
-
292
- request = request_var.get()
293
- duration_ = static_duration(duration, *args, **kwargs)
294
- duration_ = process_duration(duration_)
295
- schedule_response = client.schedule(task_id=task_id, request=request, duration=duration_)
296
- allow_token = schedule_response.allowToken
297
- nvidia_index = schedule_response.nvidiaIndex
298
- nvidia_uuid = schedule_response.nvidiaUUID
299
- release = partial(client.release, allow_token)
300
-
301
- try:
302
- worker = workers.pop(nvidia_index)
303
- except KeyError:
304
- worker = None
305
-
306
- if worker is not None and worker.process.is_alive() and schedule_response.idle:
307
- assert worker.arg_queue.empty()
308
- assert worker.res_queue.empty()
309
- else:
310
- worker = Worker(thread_wrapper, allow_token, nvidia_uuid)
311
-
312
- try:
313
- worker.arg_queue.put(((args, kwargs), GradioPartialContext.get()))
314
- except PicklingError: # TODO: detailed serialization diagnostic
315
- release(fail=True)
316
- raise
317
-
318
- yield_queue: ThreadQueue[YieldQueueResult[Res]] = ThreadQueue()
319
- def fill_yield_queue(worker: Worker[GeneratorResQueueResult[Res]]):
320
- while True:
321
- res = worker.res_queue.get()
322
- if res is None:
323
- release(fail=True, allow_404=True)
324
- yield_queue.put(AbortedResult())
325
- return
326
- if isinstance(res, ExceptionResult):
327
- release(fail=True)
328
- yield_queue.put(ExceptionResult(res.value))
329
- return
330
- if isinstance(res, EndResult):
331
- release()
332
- workers[nvidia_index] = worker
333
- yield_queue.put(EndResult())
334
- return
335
- if isinstance(res, OkResult):
336
- yield_queue.put(OkResult(res.value))
337
- continue
338
- if isinstance(res, GradioQueueEvent): # pragma: no cover (not working properly on Gradio side)
339
- try_process_queue_event(res.method_name, *res.args, **res.kwargs)
340
- continue
341
- debug(f"fill_yield_queue: assert_never({res=})")
342
- assert_never(res)
343
- from typing_extensions import assert_never
344
- with ThreadPoolExecutor() as e:
345
- f = e.submit(copy_context().run, fill_yield_queue, worker)
346
- f.add_done_callback(lambda _: debug("fill_yield_queue DONE"))
347
- while True:
348
- try:
349
- res = yield_queue.get(timeout=GENERATOR_GLOBAL_TIMEOUT)
350
- except Empty: # pragma: no cover
351
- debug(f"yield_queue TIMEOUT ({GENERATOR_GLOBAL_TIMEOUT=})")
352
- raise
353
- if isinstance(res, AbortedResult):
354
- raise gr.Error("GPU task aborted")
355
- if isinstance(res, ExceptionResult):
356
- raise res.value
357
- if isinstance(res, EndResult):
358
- break
359
- if isinstance(res, OkResult):
360
- yield res.value
361
- continue
362
- debug(f"gradio_handler: assert_never({res=})")
363
- assert_never(res)
364
-
365
-
366
- def thread_wrapper(
367
- arg_queue: Queue[tuple[Params, GradioPartialContext]],
368
- res_queue: Queue[GeneratorResQueueResult[Res] | None],
369
- allow_token: str,
370
- nvidia_uuid: str,
371
- fds: list[int],
372
- ):
373
- global forked
374
- forked = True
375
- signal.signal(signal.SIGTERM, drop_params(arg_queue.close))
376
- initialized = False
377
- while True:
378
- try:
379
- (args, kwargs), gradio_context = arg_queue.get()
380
- except OSError:
381
- break
382
- if not initialized:
383
- if (res := worker_init(
384
- res_queue=res_queue,
385
- allow_token=allow_token,
386
- nvidia_uuid=nvidia_uuid,
387
- fds=fds,
388
- )) is not None:
389
- res_queue.put(res)
390
- return
391
- initialized = True
392
- def iterate():
393
- gen = task(*args, **kwargs) # type: ignore
394
- while True:
395
- try:
396
- res = next(gen)
397
- except StopIteration:
398
- break
399
- except Exception as e:
400
- res_queue.put(ExceptionResult(e))
401
- break
402
- try:
403
- res_queue.put(OkResult(res))
404
- except PicklingError as e:
405
- res_queue.put(ExceptionResult(e))
406
- break
407
- else:
408
- continue
409
- GradioPartialContext.apply(gradio_context)
410
- with ThreadPoolExecutor() as executor:
411
- executor.submit(copy_context().run, iterate)
412
- res_queue.put(EndResult())
413
-
414
- # https://github.com/python/cpython/issues/91002
415
- if not hasattr(task, '__annotations__'):
416
- gradio_handler.__annotations__ = {}
417
-
418
- return gradio_handler
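Not part of the diff: a hedged sketch of wrapping a regular task and a generator task directly with the two wrappers defined above. In practice they are applied by the decorator layer rather than called by hand; the snippet assumes gradio is installed, a ZeroGPU environment is active, and the classify/stream functions are placeholders.

    from spaces.zero.wrappers import regular_function_wrapper, generator_function_wrapper

    def classify(prompt: str) -> str:
        return prompt.upper()

    def stream(prompt: str):
        for token in prompt.split():
            yield token

    # Fixed duration for the regular task, dynamic duration for the generator task
    gpu_classify = regular_function_wrapper(classify, duration=60)
    gpu_stream = generator_function_wrapper(stream, duration=lambda prompt: 10 + len(prompt))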