abdullahmubeen10 commited on
Commit
a29ca40
·
verified ·
1 Parent(s): 4b40a01

Upload 15 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [theme]
2
+ base="light"
3
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ import pandas as pd
5
+ import librosa
6
+
7
+ from sparknlp.base import *
8
+ from sparknlp.common import *
9
+ from sparknlp.annotator import *
10
+ from pyspark.ml import Pipeline
11
+ from sparknlp.pretrained import PretrainedPipeline
12
+ from pyspark.sql.types import *
13
+ import pyspark.sql.functions as F
14
+
15
+ # Page configuration
16
+ st.set_page_config(
17
+ layout="wide",
18
+ initial_sidebar_state="auto"
19
+ )
20
+
21
+ # Custom CSS for styling
22
+ st.markdown("""
23
+ <style>
24
+ .main-title {
25
+ font-size: 36px;
26
+ color: #4A90E2;
27
+ font-weight: bold;
28
+ text-align: center;
29
+ }
30
+ .section {
31
+ background-color: #f9f9f9;
32
+ padding: 10px;
33
+ border-radius: 10px;
34
+ margin-top: 10px;
35
+ }
36
+ .section p, .section ul {
37
+ color: #666666;
38
+ }
39
+ </style>
40
+ """, unsafe_allow_html=True)
41
+
42
@st.cache_resource
def init_spark():
    """Start (or reuse) the Spark NLP session for this app.

    Cached with ``st.cache_resource`` so Streamlit reruns share one session
    instead of bootstrapping Spark on every interaction.
    """
    session = sparknlp.start()
    return session
46
+
47
@st.cache_resource
def create_pipeline(model):
    """Build the two-stage Spark NLP ASR pipeline (cached per model name).

    Stage 1 wraps the raw float audio into Spark NLP's audio annotation;
    stage 2 runs the pretrained HuBERT CTC model to emit a "text" annotation.

    :param model: name of the pretrained HubertForCTC model to download.
    :return: an unfitted ``pyspark.ml.Pipeline``.
    """
    assembler = (
        AudioAssembler()
        .setInputCol("audio_content")
        .setOutputCol("audio_assembler")
    )

    recognizer = (
        HubertForCTC.pretrained(model)
        .setInputCols("audio_assembler")
        .setOutputCol("text")
    )

    return Pipeline(stages=[assembler, recognizer])
64
+
65
def fit_data(pipeline, fed_data):
    """Run speech-to-text on one audio file and return the annotation result.

    :param pipeline: unfitted pipeline from ``create_pipeline``
        (AudioAssembler -> HubertForCTC).
    :param fed_data: path or file-like object accepted by ``librosa.load``.
    :return: first ``fullAnnotate`` result; the transcription string is at
        ``result["text"][0].result``.

    NOTE(review): relies on the module-level ``spark`` session created at
    script scope (``spark = init_spark()``) — it must exist before this runs.
    """
    # HuBERT checkpoints expect 16 kHz audio, so resample on load.
    # (sample rate was previously bound to an unused variable; discarded now)
    samples, _ = librosa.load(fed_data, sr=16000)
    samples = samples.tolist()

    # The pipeline has no trainable stages; fit() only wires the stages
    # together so the resulting model can be wrapped in a LightPipeline.
    spark_df = spark.createDataFrame([[samples]], ["audio_content"])
    model = pipeline.fit(spark_df)

    lp = LightPipeline(model)
    return lp.fullAnnotate(samples)[0]
75
+
76
def save_uploadedfile(uploadedfile, path):
    """Persist a Streamlit upload to ``path`` under its original filename.

    :param uploadedfile: Streamlit ``UploadedFile`` (or any object exposing
        ``name`` plus either ``getbuffer()`` or ``read()``).
    :param path: destination directory; created if it does not exist.
    """
    # Robustness fix: the destination directory ("inputs") may be missing
    # on a fresh deployment; the original open() would then raise.
    os.makedirs(path, exist_ok=True)
    filepath = os.path.join(path, uploadedfile.name)
    with open(filepath, "wb") as f:
        # Prefer the zero-copy buffer when available; fall back to read().
        if hasattr(uploadedfile, 'getbuffer'):
            f.write(uploadedfile.getbuffer())
        else:
            f.write(uploadedfile.read())
84
+
85
# Sidebar content: model picker (currently a single pretrained checkpoint).
model_list = ["asr_hubert_large_ls960"]
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    model_list,
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Main content
st.markdown('<div class="main-title">Speech Recognition With HubertForCTC</div>', unsafe_allow_html=True)
st.markdown('<div class="section"><p>This demo transcribes audio files into texts using the <code>HubertForCTC</code> Annotator and advanced speech recognition models.</p></div>', unsafe_allow_html=True)

# Reference notebook link in sidebar
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown("""
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/open-source-nlp/17.0.Speech_Recognition.ipynb">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
""", unsafe_allow_html=True)

# Load bundled example audio files from the local "inputs" directory.
AUDIO_FILE_PATH = "inputs"
audio_files = sorted(os.listdir(AUDIO_FILE_PATH))

selected_audio = st.selectbox("Select an audio", audio_files)

# Creating a simplified Python list of audio file types
audio_file_types = ["mp3", "flac", "wav", "aac", "ogg", "aiff", "wma", "m4a", "ape", "dsf", "dff", "midi", "mid", "opus", "amr"]
uploadedfile = st.file_uploader("Try it for yourself!", type=audio_file_types)

# A fresh upload takes precedence over the example chosen in the selectbox.
if uploadedfile:
    selected_audio = f"{AUDIO_FILE_PATH}/{uploadedfile.name}"
    save_uploadedfile(uploadedfile, AUDIO_FILE_PATH)
elif selected_audio:
    selected_audio = f"{AUDIO_FILE_PATH}/{selected_audio}"

# Audio playback and transcription
st.subheader("Play Audio")

with open(selected_audio, 'rb') as audio_file:
    audio_bytes = audio_file.read()
    st.audio(audio_bytes)

# Heavy objects (Spark session, pipeline) are cached by st.cache_resource,
# so Streamlit reruns only pay for the transcription itself.
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_audio)

# FIX: the original used an f-string with no placeholders here.
st.subheader("Transcription:")
st.markdown(f"{(output['text'][0].result).title()}")
Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Download base image ubuntu 18.04
2
+ FROM ubuntu:18.04
3
+
4
+ # Set environment variables
5
+ ENV NB_USER jovyan
6
+ ENV NB_UID 1000
7
+ ENV HOME /home/${NB_USER}
8
+
9
+ # Install required packages
10
+ RUN apt-get update && apt-get install -y \
11
+ tar \
12
+ wget \
13
+ bash \
14
+ rsync \
15
+ gcc \
16
+ libfreetype6-dev \
17
+ libhdf5-serial-dev \
18
+ libpng-dev \
19
+ libzmq3-dev \
20
+ python3 \
21
+ python3-dev \
22
+ python3-pip \
23
+ unzip \
24
+ pkg-config \
25
+ software-properties-common \
26
+ graphviz \
27
+ openjdk-8-jdk \
28
+ ant \
29
+ ca-certificates-java \
30
+ && apt-get clean \
31
+ && update-ca-certificates -f;
32
+
33
+ # Install Python 3.8 and pip
34
+ RUN add-apt-repository ppa:deadsnakes/ppa \
35
+ && apt-get update \
36
+ && apt-get install -y python3.8 python3-pip \
37
+ && apt-get clean;
38
+
39
+ # Set up JAVA_HOME
40
+ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
41
+ RUN mkdir -p ${HOME} \
42
+ && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
43
+ && chown -R ${NB_UID}:${NB_UID} ${HOME}
44
+
45
+ # Create a new user named "jovyan" with user ID 1000
46
+ RUN useradd -m -u ${NB_UID} ${NB_USER}
47
+
48
+ # Switch to the "jovyan" user
49
+ USER ${NB_USER}
50
+
51
+ # Set home and path variables for the user
52
+ ENV HOME=/home/${NB_USER} \
53
+ PATH=/home/${NB_USER}/.local/bin:$PATH
54
+
55
+ # Set the working directory to the user's home directory
56
+ WORKDIR ${HOME}
57
+
58
+ # Upgrade pip and install Python dependencies
59
+ RUN python3.8 -m pip install --upgrade pip
60
+ COPY requirements.txt /tmp/requirements.txt
61
+ RUN python3.8 -m pip install -r /tmp/requirements.txt
62
+
63
+ # Copy the application code into the container at /home/jovyan
64
+ COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
65
+
66
+ # Expose port for Streamlit
67
+ EXPOSE 7860
68
+
69
+ # Define the entry point for the container
70
+ ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
inputs/audio-1.flac ADDED
Binary file (112 kB). View file
 
inputs/audio-10.flac ADDED
Binary file (76 kB). View file
 
inputs/audio-2.flac ADDED
Binary file (49 kB). View file
 
inputs/audio-3.flac ADDED
Binary file (74 kB). View file
 
inputs/audio-4.flac ADDED
Binary file (113 kB). View file
 
inputs/audio-5.flac ADDED
Binary file (138 kB). View file
 
inputs/audio-6.flac ADDED
Binary file (36.5 kB). View file
 
inputs/audio-7.flac ADDED
Binary file (177 kB). View file
 
inputs/audio-8.flac ADDED
Binary file (94.3 kB). View file
 
inputs/audio-9.flac ADDED
Binary file (129 kB). View file
 
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Custom CSS for better styling
4
+ st.markdown("""
5
+ <style>
6
+ .main-title {
7
+ font-size: 36px;
8
+ color: #4A90E2;
9
+ font-weight: bold;
10
+ text-align: center;
11
+ }
12
+ .sub-title {
13
+ font-size: 24px;
14
+ color: #4A90E2;
15
+ margin-top: 20px;
16
+ }
17
+ .section {
18
+ background-color: #f9f9f9;
19
+ padding: 15px;
20
+ border-radius: 10px;
21
+ margin-top: 20px;
22
+ }
23
+ .section h2 {
24
+ font-size: 22px;
25
+ color: #4A90E2;
26
+ }
27
+ .section p, .section ul {
28
+ color: #666666;
29
+ }
30
+ .link {
31
+ color: #4A90E2;
32
+ text-decoration: none;
33
+ }
34
+ .benchmark-table {
35
+ width: 100%;
36
+ border-collapse: collapse;
37
+ margin-top: 20px;
38
+ }
39
+ .benchmark-table th, .benchmark-table td {
40
+ border: 1px solid #ddd;
41
+ padding: 8px;
42
+ text-align: left;
43
+ }
44
+ .benchmark-table th {
45
+ background-color: #4A90E2;
46
+ color: white;
47
+ }
48
+ .benchmark-table td {
49
+ background-color: #f2f2f2;
50
+ }
51
+ </style>
52
+ """, unsafe_allow_html=True)
53
+
54
+ # Main Title
55
+ st.markdown('<div class="main-title">HuBERT for Speech Recognition</div>', unsafe_allow_html=True)
56
+
57
+ # Introduction
58
+ st.markdown("""
59
+ <div class="section">
60
+ <p><strong>HuBERT</strong> (Hidden-Unit BERT) is a self-supervised speech representation model introduced in the paper <em>HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units</em> by Wei-Ning Hsu et al. It tackles challenges in speech representation by predicting hidden units derived from clustered speech features, enabling the model to learn acoustic and language representations from unsegmented and unannotated audio data.</p>
61
+ </div>
62
+ """, unsafe_allow_html=True)
63
+
64
+ # Why, Where, and When to Use HuBERT
65
+
66
+ st.markdown('<div class="sub-title">Why, Where, and When to Use HuBERT</div>', unsafe_allow_html=True)
67
+
68
+ # Explanation Section
69
+ st.markdown("""
70
+ <div class="section">
71
+ <p><strong>HuBERT</strong> is particularly useful in scenarios where high-quality speech-to-text conversion is required and where there is a need for robust speech representation learning. The model’s design makes it suitable for tasks where data may be noisy or unannotated. Key use cases include:</p>
72
+ </div>
73
+ """, unsafe_allow_html=True)
74
+
75
+ # Use Cases Section
76
+ st.markdown('<div class="sub-title">Use Cases</div>', unsafe_allow_html=True)
77
+ st.markdown("""
78
+ <div class="section">
79
+ <ul>
80
+ <li><strong>Noisy Environment Transcription:</strong> Ideal for transcribing speech in noisy or challenging audio environments, such as call centers or field recordings.</li>
81
+ <li><strong>Preprocessing for NLP Tasks:</strong> Converts spoken language into text for NLP tasks like sentiment analysis, topic modeling, or entity recognition.</li>
82
+ <li><strong>Audio Content Analysis:</strong> Efficiently analyzes large volumes of audio content, enabling keyword extraction and content summarization.</li>
83
+ <li><strong>Language Model Enhancement:</strong> Enhances language models by providing robust speech representations, improving accuracy in tasks like machine translation or voice-activated systems.</li>
84
+ </ul>
85
+ </div>
86
+ """, unsafe_allow_html=True)
87
+
88
+ # How to Use the Model
89
+ st.markdown('<div class="sub-title">HuBERT Pipeline in Spark NLP</div>', unsafe_allow_html=True)
90
+ st.markdown("""
91
+ <div class="section">
92
+ <p>To use the HuBERT model in Spark NLP, follow the example code below. This code demonstrates how to assemble audio data and apply the HubertForCTC annotator to convert speech to text.</p>
93
+ </div>
94
+ """, unsafe_allow_html=True)
95
+ st.code('''
96
+ audio_assembler = AudioAssembler()\\
97
+ .setInputCol("audio_content")\\
98
+ .setOutputCol("audio_assembler")
99
+
100
+ speech_to_text = HubertForCTC.pretrained("asr_hubert_large_ls960", "en")\\
101
+ .setInputCols("audio_assembler")\\
102
+ .setOutputCol("text")
103
+
104
+ pipeline = Pipeline(stages=[
105
+ audio_assembler,
106
+ speech_to_text,
107
+ ])
108
+
109
+ pipelineModel = pipeline.fit(audioDf)
110
+
111
+ pipelineDF = pipelineModel.transform(audioDf)
112
+ ''', language='python')
113
+
114
+ # Model Information
115
+ st.markdown('<div class="sub-title">Model Information</div>', unsafe_allow_html=True)
116
+ st.markdown("""
117
+ <div class="section">
118
+ <table class="benchmark-table">
119
+ <tr>
120
+ <th>Attribute</th>
121
+ <th>Description</th>
122
+ </tr>
123
+ <tr>
124
+ <td><strong>Model Name</strong></td>
125
+ <td>asr_hubert_large_ls960</td>
126
+ </tr>
127
+ <tr>
128
+ <td><strong>Compatibility</strong></td>
129
+ <td>Spark NLP 4.3.0+</td>
130
+ </tr>
131
+ <tr>
132
+ <td><strong>License</strong></td>
133
+ <td>Open Source</td>
134
+ </tr>
135
+ <tr>
136
+ <td><strong>Edition</strong></td>
137
+ <td>Official</td>
138
+ </tr>
139
+ <tr>
140
+ <td><strong>Input Labels</strong></td>
141
+ <td>[audio_assembler]</td>
142
+ </tr>
143
+ <tr>
144
+ <td><strong>Output Labels</strong></td>
145
+ <td>[text]</td>
146
+ </tr>
147
+ <tr>
148
+ <td><strong>Language</strong></td>
149
+ <td>en</td>
150
+ </tr>
151
+ <tr>
152
+ <td><strong>Size</strong></td>
153
+ <td>1.5 GB</td>
154
+ </tr>
155
+ </table>
156
+ </div>
157
+ """, unsafe_allow_html=True)
158
+
159
+ # Data Source Section
160
+ st.markdown('<div class="sub-title">Data Source</div>', unsafe_allow_html=True)
161
+ st.markdown("""
162
+ <div class="section">
163
+ <p>The HuBERT model is available on <a class="link" href="https://huggingface.co/facebook/hubert-large-ls960-ft" target="_blank">Hugging Face</a>. It was fine-tuned on 960 hours of Librispeech data and is optimized for 16kHz sampled speech audio. Ensure your input audio is sampled at the same rate for optimal performance.</p>
164
+ </div>
165
+ """, unsafe_allow_html=True)
166
+
167
+ # Conclusion
168
+ st.markdown('<div class="sub-title">Conclusion</div>', unsafe_allow_html=True)
169
+ st.markdown("""
170
+ <div class="section">
171
+ <p><strong>HuBERT</strong> offers a powerful solution for self-supervised speech recognition, especially in challenging audio environments. Its ability to learn from unannotated data and predict masked speech units makes it a robust model for various speech-related tasks. Integrated into Spark NLP, HuBERT is ready for large-scale deployment, supporting a wide range of applications from transcription to feature extraction.</p>
172
+ <p>If you’re working on speech recognition projects that require resilience to noise and variability, HuBERT provides an advanced, scalable option.</p>
173
+ </div>
174
+ """, unsafe_allow_html=True)
175
+
176
+ # References
177
+ st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
178
+ st.markdown("""
179
+ <div class="section">
180
+ <ul>
181
+ <li><a class="link" href="https://sparknlp.org/2023/02/07/asr_hubert_large_ls960_en.html" target="_blank">HuBERT Model on Spark NLP</a></li>
182
+ <li><a class="link" href="https://huggingface.co/facebook/hubert-large-ls960-ft" target="_blank">HuBERT Model on Hugging Face</a></li>
183
+ <li><a class="link" href="https://github.com/pytorch/fairseq/tree/master/examples/hubert" target="_blank">HuBERT GitHub Repository</a></li>
184
+ <li><a class="link" href="https://arxiv.org/abs/2106.07447" target="_blank">HuBERT Paper on arXiv</a></li>
185
+ </ul>
186
+ </div>
187
+ """, unsafe_allow_html=True)
188
+
189
+ # Community & Support
190
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
191
+ st.markdown("""
192
+ <div class="section">
193
+ <ul>
194
+ <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
195
+ <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
196
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
197
+ <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
198
+ <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
199
+ </ul>
200
+ </div>
201
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ spark-nlp
3
+ pyspark
4
+ librosa
5
+ pandas