Sorvad

Runtime error

App Files Files Community

vpcom committed on Oct 25, 2023

Commit

108fbc5

1 Parent(s): bd7413b

feat: implement our own InferenceClient

Browse files

Files changed (1) hide show

app.py +127 -4

app.py CHANGED Viewed

@@ -20,6 +20,14 @@ from gradio.components import (
     Textbox,
     get_component_instance,
 )
 from gradio.themes import ThemeClass as Theme
 import gradio as gr
@@ -30,6 +38,18 @@ import anyio
 from huggingface_hub import Repository, InferenceClient
 from utils import force_git_push
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
 MODEL_NAME = os.getenv("MODEL_NAME")
@@ -97,10 +117,113 @@ stop_sequences = ["<|endoftext|>"] # ":پایان","@","#","$",
 #     ["<%مهدی اخوان ثالث%"],
 #     ]
-client = InferenceClient(
     API_URL,
-    headers={"Authorization": f"Bearer {HF_TOKEN}",
-             "use_cache": False},
 )
 def asynchronous_push(f_stop):
@@ -209,7 +332,7 @@ additional_inputs=[
     ),
     gr.Slider(
         label="Top-p (nucleus sampling)",
-        value=1.0,
         minimum=0.0,
         maximum=1,
         step=0.05,

     Textbox,
     get_component_instance,
 )
+from huggingface_hub.utils import (
+    BadRequestError,
+    build_hf_headers,
+    get_session,
+    hf_raise_for_status,
+)
 from gradio.themes import ThemeClass as Theme
 import gradio as gr
 from huggingface_hub import Repository, InferenceClient
 from utils import force_git_push
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Union,
+    overload,
+)
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
 MODEL_NAME = os.getenv("MODEL_NAME")
 #     ["<%مهدی اخوان ثالث%"],
 #     ]
+class InferenceClientUS(InferenceClient):  # InferenceClient subclass that defines its own post()
     # NOTE(review): post() uses ContentT, warnings, TASKS_EXPECTING_IMAGES, time, _open_as_binary,
     # InferenceTimeoutError, HTTPError and logger; none of their imports appear in the hunks shown
     # here -- confirm they are in scope elsewhere in app.py.
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        token: Union[str, bool, None] = None,
+        timeout: Optional[float] = None,
+        headers: Optional[Dict[str, str]] = None,
+        cookies: Optional[Dict[str, str]] = None,
+    ) -> None:  # pure pass-through: every argument is forwarded unchanged to the parent constructor
+        super().__init__(
+            model=model,
+            token=token,
+            timeout=timeout,
+            headers=headers,
+            cookies=cookies,
+        )
+    def post(
+        self,
+        *,
+        json: Optional[Union[str, Dict, List]] = None,
+        data: Optional[ContentT] = None,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
+        stream: bool = False,
+    ) -> Union[bytes, Iterable[bytes]]:
+        """
+        Make a POST request to the inference server.
+        Args:
+            json (`Union[str, Dict, List]`, *optional*):
+                The JSON data to send in the request body. Defaults to None.
+            data (`Union[str, Path, bytes, BinaryIO]`, *optional*):
+                The content to send in the request body. It can be raw bytes, a pointer to an opened file, a local file
+                path, or a URL to an online resource (image, audio file,...). If both `json` and `data` are passed,
+                `data` will take precedence. At least `json` or `data` must be provided. Defaults to None.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. Will override the model defined at the instance level. Defaults to None.
+            task (`str`, *optional*):
+                The task to perform on the inference. Used only to default to a recommended model if `model` is not
+                provided. At least `model` or `task` must be provided. Defaults to None.
+            stream (`bool`, *optional*):
+                Whether to iterate over streaming APIs.
+        Returns:
+            bytes: The raw bytes returned by the server.
+        Raises:
+            [`InferenceTimeoutError`]:
+                If the model is unavailable or the request times out.
+            `HTTPError`:
+                If the request fails with an HTTP error status code other than HTTP 503.
+        """
+        url = self._resolve_url(model, task)  # _resolve_url is not defined in this class; presumably inherited
+        if data is not None and json is not None:
+            warnings.warn("Ignoring `json` as `data` is passed as binary.")
+        # Set Accept header if relevant
+        headers = self.headers.copy()  # copy so the Accept header added below does not mutate self.headers
+        if task in TASKS_EXPECTING_IMAGES and "Accept" not in headers:
+            headers["Accept"] = "image/png"
+        t0 = time.time()  # start of the retry window
+        timeout = self.timeout  # local budget; only read by the 503 handling below
+        while True:  # retry loop: left only through `return` or `raise`
+            with _open_as_binary(data) as data_as_binary:
+                try:
+                    response = get_session().post(
+                        url,
+                        json=json,
+                        data=data_as_binary,
+                        headers=headers,
+                        cookies=self.cookies,
+                        timeout=self.timeout,  # NOTE(review): full self.timeout on every attempt, not the local `timeout` -- confirm intended
+                        stream=stream,
+                    )
+                except TimeoutError as error:
+                    # Convert any `TimeoutError` to a `InferenceTimeoutError`
+                    raise InferenceTimeoutError(f"Inference call timed out: {url}") from error  # type: ignore
+            try:
+                hf_raise_for_status(response)
+                return response.iter_lines() if stream else response.content  # stream=True returns a line iterator, otherwise the whole body
+            except HTTPError as error:
+                if error.response.status_code == 503:  # only 503 is retried; other statuses reach the bare `raise` below
+                    # If Model is unavailable, either raise a TimeoutError...
                     # NOTE(review): after the first retry `timeout` is the reduced value set at the bottom of
                     # this branch, so elapsed time is compared against the remaining budget rather than
                     # self.timeout; the error can therefore fire before self.timeout has elapsed -- confirm.
+                    if timeout is not None and time.time() - t0 > timeout:
+                        raise InferenceTimeoutError(
+                            f"Model not loaded on the server: {url}. Please retry with a higher timeout (current:"
+                            f" {self.timeout}).",
+                            request=error.request,
+                            response=error.response,
+                        ) from error
+                    # ...or wait 1s and retry
+                    logger.info(f"Waiting for model to be loaded on the server: {error}")
+                    time.sleep(1)
+                    if timeout is not None:
+                        timeout = max(self.timeout - (time.time() - t0), 1)  # type: ignore
+                    continue
+                raise
+client = InferenceClientUS(  # module-level instance of the subclass (replaces the plain InferenceClient on the old side of the diff)
     API_URL,  # first positional argument, i.e. the `model` parameter of __init__
+    headers={"Authorization": f"Bearer {HF_TOKEN}"},  # bearer token read from the HF_TOKEN environment variable; the old "use_cache" header entry is gone
 )
 def asynchronous_push(f_stop):
     ),
     gr.Slider(
         label="Top-p (nucleus sampling)",
+        value=0.9,
         minimum=0.0,
         maximum=1,
         step=0.05,