Joshua Sundance Bailey
committed on
Commit
·
693876a
0
Parent(s):
initial commit
Browse files- .gitattributes +35 -0
- .github/ISSUE_TEMPLATE/bug_report.md +38 -0
- .github/ISSUE_TEMPLATE/feature_request.md +17 -0
- .github/dependabot.yml +11 -0
- .github/pull_request_template.md +12 -0
- .github/workflows/check-file-size-limit.yml +14 -0
- .github/workflows/hf-space.yml +19 -0
- .gitignore +94 -0
- .pre-commit-config.yaml +65 -0
- LICENSE +9 -0
- README.md +23 -0
- app.py +245 -0
- bumpver.toml +16 -0
- requirements.txt +11 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.github/ISSUE_TEMPLATE/bug_report.md
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
name: Bug report
|
3 |
+
about: Create a report to help us improve
|
4 |
+
title: ''
|
5 |
+
labels: bug
|
6 |
+
assignees: ''
|
7 |
+
|
8 |
+
---
|
9 |
+
|
10 |
+
**Describe the bug**
|
11 |
+
A clear and concise description of what the bug is.
|
12 |
+
|
13 |
+
**To Reproduce**
|
14 |
+
Steps to reproduce the behavior:
|
15 |
+
1. Go to '...'
|
16 |
+
2. Click on '....'
|
17 |
+
3. Scroll down to '....'
|
18 |
+
4. See error
|
19 |
+
|
20 |
+
**Expected behavior**
|
21 |
+
A clear and concise description of what you expected to happen.
|
22 |
+
|
23 |
+
**Screenshots**
|
24 |
+
If applicable, add screenshots to help explain your problem.
|
25 |
+
|
26 |
+
**Desktop (please complete the following information):**
|
27 |
+
- OS: [e.g. iOS]
|
28 |
+
- Browser [e.g. chrome, safari]
|
29 |
+
- Version [e.g. 22]
|
30 |
+
|
31 |
+
**Smartphone (please complete the following information):**
|
32 |
+
- Device: [e.g. iPhone6]
|
33 |
+
- OS: [e.g. iOS8.1]
|
34 |
+
- Browser [e.g. stock browser, safari]
|
35 |
+
- Version [e.g. 22]
|
36 |
+
|
37 |
+
**Additional context**
|
38 |
+
Add any other context about the problem here.
|
.github/ISSUE_TEMPLATE/feature_request.md
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
name: Feature request
|
3 |
+
about: Suggest an idea for this project
|
4 |
+
title: ''
|
5 |
+
labels: enhancement
|
6 |
+
assignees: ''
|
7 |
+
|
8 |
+
---
|
9 |
+
|
10 |
+
**Describe the solution you'd like**
|
11 |
+
A clear and concise description of what you want to happen.
|
12 |
+
|
13 |
+
**Describe alternatives you've considered**
|
14 |
+
A clear and concise description of any alternative solutions or features you've considered.
|
15 |
+
|
16 |
+
**Additional context**
|
17 |
+
Add any other context or screenshots about the feature request here.
|
.github/dependabot.yml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# To get started with Dependabot version updates, you'll need to specify which
|
2 |
+
# package ecosystems to update and where the package manifests are located.
|
3 |
+
# Please see the documentation for all configuration options:
|
4 |
+
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
|
5 |
+
|
6 |
+
version: 2
|
7 |
+
updates:
|
8 |
+
- package-ecosystem: "pip" # See documentation for possible values
|
9 |
+
directory: "/" # Location of package manifests
|
10 |
+
schedule:
|
11 |
+
interval: "weekly"
|
.github/pull_request_template.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Thank you for contributing!
|
2 |
+
Before submitting this PR, please make sure:
|
3 |
+
|
4 |
+
- [ ] Your code builds clean without any errors or warnings
|
5 |
+
- [ ] Your code doesn't break anything we can't fix
|
6 |
+
- [ ] You have added appropriate tests
|
7 |
+
|
8 |
+
Please check one or more of the following to describe the nature of this PR:
|
9 |
+
- [ ] New feature
|
10 |
+
- [ ] Bug fix
|
11 |
+
- [ ] Documentation
|
12 |
+
- [ ] Other
|
.github/workflows/check-file-size-limit.yml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: 10 MB file size limit
|
2 |
+
on:
|
3 |
+
pull_request:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
jobs:
|
7 |
+
check-file-sizes:
|
8 |
+
runs-on: ubuntu-latest
|
9 |
+
steps:
|
10 |
+
- name: Check large files
|
11 |
+
uses: ActionsDesk/[email protected]
|
12 |
+
with:
|
13 |
+
filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
|
14 |
+
token: ${{ secrets.WORKFLOW_GIT_ACCESS_TOKEN }}
|
.github/workflows/hf-space.yml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Push to HuggingFace Space
|
2 |
+
|
3 |
+
on:
|
4 |
+
workflow_dispatch:
|
5 |
+
|
6 |
+
jobs:
|
7 |
+
push-to-huggingface:
|
8 |
+
runs-on: ubuntu-latest
|
9 |
+
steps:
|
10 |
+
- uses: actions/checkout@v2
|
11 |
+
with:
|
12 |
+
fetch-depth: 0
|
13 |
+
token: ${{ secrets.WORKFLOW_GIT_ACCESS_TOKEN }}
|
14 |
+
|
15 |
+
- name: Push to HuggingFace Space
|
16 |
+
env:
|
17 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
18 |
+
run: |
|
19 |
+
git push https://joshuasundance:[email protected]/spaces/joshuasundance/langchain-streamlit-demo main
|
.gitignore
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
hf_cache/
|
2 |
+
govgis-nov2023/
|
3 |
+
*$py.class
|
4 |
+
*.chainlit
|
5 |
+
*.chroma
|
6 |
+
*.cover
|
7 |
+
*.egg
|
8 |
+
*.egg-info/
|
9 |
+
*.env
|
10 |
+
*.langchain.db
|
11 |
+
*.log
|
12 |
+
*.manifest
|
13 |
+
*.mo
|
14 |
+
*.pot
|
15 |
+
*.py,cover
|
16 |
+
*.py[cod]
|
17 |
+
*.sage.py
|
18 |
+
*.so
|
19 |
+
*.spec
|
20 |
+
.DS_STORE
|
21 |
+
.Python
|
22 |
+
.cache
|
23 |
+
.coverage
|
24 |
+
.coverage.*
|
25 |
+
.dmypy.json
|
26 |
+
.eggs/
|
27 |
+
.env
|
28 |
+
.hypothesis/
|
29 |
+
.idea
|
30 |
+
.installed.cfg
|
31 |
+
.ipynb_checkpoints
|
32 |
+
.mypy_cache/
|
33 |
+
.nox/
|
34 |
+
.pyre/
|
35 |
+
.pytest_cache/
|
36 |
+
.python-version
|
37 |
+
.ropeproject
|
38 |
+
.ruff_cache/
|
39 |
+
.scrapy
|
40 |
+
.spyderproject
|
41 |
+
.spyproject
|
42 |
+
.tox/
|
43 |
+
.venv
|
44 |
+
.vscode
|
45 |
+
.webassets-cache
|
46 |
+
/site
|
47 |
+
ENV/
|
48 |
+
MANIFEST
|
49 |
+
__pycache__
|
50 |
+
__pycache__/
|
51 |
+
__pypackages__/
|
52 |
+
build/
|
53 |
+
celerybeat-schedule
|
54 |
+
celerybeat.pid
|
55 |
+
coverage.xml
|
56 |
+
credentials.json
|
57 |
+
data/
|
58 |
+
db.sqlite3
|
59 |
+
db.sqlite3-journal
|
60 |
+
develop-eggs/
|
61 |
+
dist/
|
62 |
+
dmypy.json
|
63 |
+
docs/_build/
|
64 |
+
downloads/
|
65 |
+
eggs/
|
66 |
+
env.bak/
|
67 |
+
env/
|
68 |
+
fly.toml
|
69 |
+
htmlcov/
|
70 |
+
instance/
|
71 |
+
ipython_config.py
|
72 |
+
junk/
|
73 |
+
lib/
|
74 |
+
lib64/
|
75 |
+
local_settings.py
|
76 |
+
models/*.bin
|
77 |
+
nosetests.xml
|
78 |
+
lab/scratch/
|
79 |
+
lab/
|
80 |
+
parts/
|
81 |
+
pip-delete-this-directory.txt
|
82 |
+
pip-log.txt
|
83 |
+
pip-wheel-metadata/
|
84 |
+
profile_default/
|
85 |
+
sdist/
|
86 |
+
share/python-wheels/
|
87 |
+
storage
|
88 |
+
target/
|
89 |
+
token.json
|
90 |
+
var/
|
91 |
+
venv
|
92 |
+
venv.bak/
|
93 |
+
venv/
|
94 |
+
wheels/
|
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Don't know what this file is? See https://pre-commit.com/
|
2 |
+
# pip install pre-commit
|
3 |
+
# pre-commit install
|
4 |
+
# pre-commit autoupdate
|
5 |
+
# Apply to all files without committing:
|
6 |
+
# pre-commit run --all-files
|
7 |
+
# I recommend running this until you pass all checks, and then commit.
|
8 |
+
# Fix what you need to and then let the pre-commit hooks resolve their conflicts.
|
9 |
+
# You may need to git add -u between runs.
|
10 |
+
exclude: "AI_CHANGELOG.md"
|
11 |
+
repos:
|
12 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
13 |
+
rev: "v0.1.6"
|
14 |
+
hooks:
|
15 |
+
- id: ruff
|
16 |
+
args: [--fix, --exit-non-zero-on-fix, --ignore, E501]
|
17 |
+
- repo: https://github.com/koalaman/shellcheck-precommit
|
18 |
+
rev: v0.9.0
|
19 |
+
hooks:
|
20 |
+
- id: shellcheck
|
21 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
22 |
+
rev: v4.5.0
|
23 |
+
hooks:
|
24 |
+
- id: check-ast
|
25 |
+
- id: check-builtin-literals
|
26 |
+
- id: check-merge-conflict
|
27 |
+
- id: check-symlinks
|
28 |
+
- id: check-toml
|
29 |
+
- id: check-xml
|
30 |
+
- id: debug-statements
|
31 |
+
- id: check-case-conflict
|
32 |
+
- id: check-docstring-first
|
33 |
+
- id: check-executables-have-shebangs
|
34 |
+
- id: check-json
|
35 |
+
# - id: check-yaml
|
36 |
+
- id: debug-statements
|
37 |
+
- id: fix-byte-order-marker
|
38 |
+
- id: detect-private-key
|
39 |
+
- id: end-of-file-fixer
|
40 |
+
- id: trailing-whitespace
|
41 |
+
- id: mixed-line-ending
|
42 |
+
- id: requirements-txt-fixer
|
43 |
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
44 |
+
rev: v1.7.1
|
45 |
+
hooks:
|
46 |
+
- id: mypy
|
47 |
+
additional_dependencies:
|
48 |
+
- types-PyYAML
|
49 |
+
- repo: https://github.com/asottile/add-trailing-comma
|
50 |
+
rev: v3.1.0
|
51 |
+
hooks:
|
52 |
+
- id: add-trailing-comma
|
53 |
+
#- repo: https://github.com/dannysepler/rm_unneeded_f_str
|
54 |
+
# rev: v0.2.0
|
55 |
+
# hooks:
|
56 |
+
# - id: rm-unneeded-f-str
|
57 |
+
- repo: https://github.com/psf/black
|
58 |
+
rev: 23.11.0
|
59 |
+
hooks:
|
60 |
+
- id: black
|
61 |
+
- repo: https://github.com/PyCQA/bandit
|
62 |
+
rev: 1.7.5
|
63 |
+
hooks:
|
64 |
+
- id: bandit
|
65 |
+
args: ["-x", "tests/*.py"]
|
LICENSE
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Joshua Sundance Bailey
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
6 |
+
|
7 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
8 |
+
|
9 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
README.md
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Govgis Nov2023-slim-faiss
|
3 |
+
emoji: 🐨
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: gray
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.28.2
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
# govgis_nov2023-slim-faiss
|
14 |
+
|
15 |
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
16 |
+
[![python](https://img.shields.io/badge/Python-3.11-3776AB.svg?style=flat&logo=python&logoColor=white)](https://www.python.org)
|
17 |
+
|
18 |
+
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
|
19 |
+
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v1.json)](https://github.com/charliermarsh/ruff)
|
20 |
+
[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
|
21 |
+
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
|
22 |
+
|
23 |
+
[![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
|
app.py
ADDED
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from operator import itemgetter
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
import streamlit as st
|
6 |
+
import yaml
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
+
from langchain.chat_models import ChatAnthropic
|
9 |
+
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
10 |
+
from langchain.prompts import ChatPromptTemplate, PromptTemplate
|
11 |
+
from langchain.schema.document import Document
|
12 |
+
from langchain.schema.output_parser import StrOutputParser
|
13 |
+
from langchain.vectorstores import FAISS
|
14 |
+
|
15 |
+
DEFAULT_TEMPERATURE = 0.5
|
16 |
+
DEFAULT_MAX_TOKENS = 512
|
17 |
+
DEFAULT_SEARCH_RESULT_LIMIT = 3
|
18 |
+
default_hf_home = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
|
19 |
+
HF_HOME = os.environ.get("HF_HOME", default_hf_home)
|
20 |
+
|
21 |
+
if "chain" not in st.session_state:
|
22 |
+
st.session_state.chain = None
|
23 |
+
|
24 |
+
with st.sidebar:
|
25 |
+
st.session_state.search_result_limit = st.slider(
|
26 |
+
"Search Result Limit",
|
27 |
+
min_value=1,
|
28 |
+
max_value=10,
|
29 |
+
value=DEFAULT_SEARCH_RESULT_LIMIT,
|
30 |
+
step=1,
|
31 |
+
)
|
32 |
+
|
33 |
+
st.session_state.anthropic_api_key = st.text_input(
|
34 |
+
"Anthropic API Key",
|
35 |
+
type="password",
|
36 |
+
)
|
37 |
+
|
38 |
+
st.session_state.temperature = st.slider(
|
39 |
+
"Temperature",
|
40 |
+
min_value=0.0,
|
41 |
+
max_value=1.0,
|
42 |
+
value=DEFAULT_TEMPERATURE,
|
43 |
+
step=0.05,
|
44 |
+
)
|
45 |
+
|
46 |
+
st.session_state.max_tokens = st.slider(
|
47 |
+
"Max Tokens",
|
48 |
+
min_value=512,
|
49 |
+
max_value=12800,
|
50 |
+
value=DEFAULT_MAX_TOKENS,
|
51 |
+
step=256,
|
52 |
+
)
|
53 |
+
|
54 |
+
st.session_state.use_instant_for_rephrase = st.checkbox(
|
55 |
+
"Use `claude-instant-v1` to generate search query",
|
56 |
+
value=True,
|
57 |
+
)
|
58 |
+
|
59 |
+
|
60 |
+
@st.cache_resource
def get_embedding_model(device: str = "cpu", **kwargs) -> HuggingFaceBgeEmbeddings:
    """Build (and cache across reruns) the BGE embedding model.

    Args:
        device: torch device string passed through to the model, e.g. "cpu".
        **kwargs: forwarded verbatim to ``HuggingFaceBgeEmbeddings``.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance with normalized embeddings,
        using ``HF_HOME`` as its download cache.
    """
    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={"device": device},
        # Normalization makes cosine similarity equivalent to dot product.
        encode_kwargs={"normalize_embeddings": True},
        cache_folder=HF_HOME,
        **kwargs,
    )
|
72 |
+
|
73 |
+
|
74 |
+
@st.cache_data
def download_data_from_hub(**kwargs) -> str:
    """Download the serialized FAISS index from the HuggingFace Hub.

    Args:
        **kwargs: forwarded verbatim to ``hf_hub_download``.

    Returns:
        Local filesystem path of the downloaded file (cached under ``HF_HOME``).
    """
    # NOTE(review): the repo id says "slim-spatial" while the filename says
    # "slim-nospatial" — looks intentional but worth confirming upstream.
    return hf_hub_download(
        repo_id="joshuasundance/govgis_nov2023-slim-spatial",
        filename="govgis_nov2023-slim-nospatial.faiss.bytes",
        repo_type="dataset",
        cache_dir=HF_HOME,
        **kwargs,
    )
|
86 |
+
|
87 |
+
|
88 |
+
@st.cache_resource
def get_faiss(
    serialized_bytes_path: Optional[str] = None,
    embeddings: Optional[HuggingFaceBgeEmbeddings] = None,
) -> FAISS:
    """Load (and cache across reruns) the FAISS vector store.

    Args:
        serialized_bytes_path: path to a serialized index; when falsy, the
            index is downloaded from the Hub via ``download_data_from_hub``.
        embeddings: embedding model to attach; defaults to
            ``get_embedding_model()``.

    Returns:
        The deserialized ``FAISS`` store.
    """
    source = serialized_bytes_path or download_data_from_hub()
    with open(source, "rb") as fh:
        payload = fh.read()
    return FAISS.deserialize_from_bytes(
        embeddings=embeddings or get_embedding_model(),
        serialized=payload,
    )
|
99 |
+
|
100 |
+
|
101 |
+
def _combine_documents(
    docs: list[Document],
    document_separator: str = "\n\n",
) -> str:
    """Render each document's content as a fenced YAML block and join them.

    The page content is already YAML text (see ``display_docs``), so each
    document becomes one ```` ```yaml ```` code fence in the LLM context.
    """
    fenced = [f"```yaml\n{doc.page_content}\n```" for doc in docs]
    return document_separator.join(fenced)
|
106 |
+
|
107 |
+
|
108 |
+
# Prompt for step 1 of the chain: turn the raw user input into a standalone
# natural-language query suited to semantic (not keyword) search.
rephrase_template = """Given the User Input, return an English natural language Search Query that will return the most relevant documents.
Remember, you are working with a semantic search engine. It is not based solely on keywords or Google-Fu.
Be creative with your search query.
Your entire response will be fed directly into the search engine. Omit any text that is not part of the search query.

User Input: {question}"""
REPHRASE_QUESTION_PROMPT = PromptTemplate.from_template(rephrase_template)


# Prompt for the final step: summarize the retrieved results ({context})
# with URLs, relative to the rephrased query ({question}).
answer_template = """The following search results were found for the given user query.
Provide a description of the relevant search results, providing relevant URLs and details.
Describing the search results in the context of the query is more important than answering the query.
Do not answer without referring to the search results; the search results are the most important part of the answer.
Base your response on the search results.
Always provide a URL when referencing a specific service, dataset, or API.
If multiple search results are relevant to the user's query, describe each result separately.
Describe what sets each result apart from the others.
Be detailed and specific, so the user can find the information they need.
Format your response as markdown as appropriate.
----------------
Search Results:
{context}
----------------
Question: {question}"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(answer_template)
|
133 |
+
|
134 |
+
|
135 |
+
def get_chain(rephrase_llm, answer_llm, retriever):
    """
    Return a chain that rephrases, retrieves, and responds.

    Output keys:
    - search_query: str
    - docs: list[Document]
    - answer: str
    """
    # Stage 1: rewrite the user's question into a standalone search query.
    rephrase = (
        REPHRASE_QUESTION_PROMPT
        | rephrase_llm
        | {"search_query": StrOutputParser()}
    )

    # Stage 2: run the query through the retriever while carrying the
    # query itself forward under both "search_query" and "question".
    retrieve = {
        "search_query": itemgetter("search_query"),
        "docs": itemgetter("search_query") | retriever,
        "question": itemgetter("search_query"),
    }

    # Stage 3: summarize the retrieved documents; pass query and docs
    # through unchanged so the caller can display them.
    respond = {
        "search_query": itemgetter("search_query"),
        "docs": itemgetter("docs"),
        "answer": (
            {
                "context": (lambda x: _combine_documents(x["docs"])),
                "question": itemgetter("question"),
            }
            | ANSWER_PROMPT
            | answer_llm
            | StrOutputParser()
        ),
    }

    return rephrase | retrieve | respond
|
170 |
+
|
171 |
+
|
172 |
+
# Load the vector store and expose it as a retriever honoring the
# sidebar's result-limit setting.
db = get_faiss()
retriever = db.as_retriever(
    search_kwargs={"k": st.session_state.search_result_limit},
)

# Only build the full rephrase->retrieve->respond chain when the user has
# supplied an API key; otherwise st.session_state.chain stays None and the
# app falls back to raw retrieval.
if st.session_state.anthropic_api_key:
    rephrase_model = (
        "claude-instant-v1"
        if st.session_state.use_instant_for_rephrase
        else "claude-2.1"
    )
    rephrase_llm = ChatAnthropic(
        model=rephrase_model,
        temperature=st.session_state.temperature,
        # The rephrased query is short, so cap it well below the answer budget.
        max_tokens_to_sample=512,
        anthropic_api_key=st.session_state.anthropic_api_key,
    )

    answer_llm = ChatAnthropic(
        model="claude-2.1",
        temperature=st.session_state.temperature,
        max_tokens_to_sample=st.session_state.max_tokens,
        anthropic_api_key=st.session_state.anthropic_api_key,
    )

    st.session_state.chain = get_chain(rephrase_llm, answer_llm, retriever)
|
195 |
+
|
196 |
+
|
197 |
+
# Main query box; an empty value means nothing is rendered below.
user_input = st.text_input(
    "What are you looking for?",
    value="",
)

# Markdown template for rendering a single search-result document.
# NOTE(review): this constant is not referenced anywhere in this file
# (display_docs builds its markdown inline) — kept for compatibility.
doc_md = """## [{name}]({url})

### Type
{type}

### Description
{description}

### Parent Service Description
{parent_service_description}

### Fields
{fields}
"""
|
216 |
+
|
217 |
+
|
218 |
+
def display_docs(docs: list[Document]) -> None:
    """Render retrieved documents in the Streamlit UI.

    Each document's ``page_content`` is YAML describing a GIS service layer
    (name, url, type, descriptions, fields); missing optional keys fall back
    to an empty string.
    """
    fallback = ""
    for doc in docs:
        meta = yaml.safe_load(doc.page_content)
        st.markdown(f"## [{meta['name']}]({meta['url']})")
        st.markdown(f"### Type\n{meta['type']}")
        st.markdown("### Description")
        # Descriptions may contain raw HTML, so render via the html component.
        st.components.v1.html(meta.get("description", fallback))
        st.markdown("### Parent Service Description")
        st.components.v1.html(meta.get("parent_service_description", fallback))
        if meta.get("fields", None):
            st.markdown("### Fields")
            for field in meta["fields"]:
                st.markdown(f"- {field}")
|
232 |
+
|
233 |
+
|
234 |
+
# Main dispatch: with a chain (API key supplied) run the full pipeline and
# show query/answer/documents; without one, fall back to raw semantic search.
if user_input:
    chain = st.session_state.chain
    if chain is None:
        # No LLM available: show the retriever's results directly.
        display_docs(retriever.invoke(user_input))
    else:
        result = chain.invoke({"question": user_input})
        st.markdown("# Query")
        st.markdown(result["search_query"])
        st.markdown("# Answer")
        st.markdown(result["answer"])
        st.markdown("# Documents")
        display_docs(result["docs"])
|
bumpver.toml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[bumpver]
|
2 |
+
current_version = "0.0.1"
|
3 |
+
version_pattern = "MAJOR.MINOR.PATCH"
|
4 |
+
commit_message = "bump version {old_version} -> {new_version}"
|
5 |
+
tag_message = "{new_version}"
|
6 |
+
tag_scope = "default"
|
7 |
+
pre_commit_hook = ""
|
8 |
+
post_commit_hook = ""
|
9 |
+
commit = true
|
10 |
+
tag = true
|
11 |
+
push = true
|
12 |
+
|
13 |
+
[bumpver.file_patterns]
|
14 |
+
"bumpver.toml" = [
|
15 |
+
'current_version = "{version}"',
|
16 |
+
]
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
anthropic==0.7.5
|
2 |
+
faiss-cpu==1.7.4
|
3 |
+
huggingface-hub==0.19.4
|
4 |
+
langchain==0.0.341
|
5 |
+
langsmith==0.0.66
|
6 |
+
openai==1.3.5
|
7 |
+
pydantic==2.5.2
|
8 |
+
PyYAML==6.0.1
|
9 |
+
sentence-transformers==2.2.2
|
10 |
+
streamlit==1.28.2
|
11 |
+
torch==2.1.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html
|