github-actions[bot] committed
Commit 9d0299c · 1 Parent(s): f3e1ec9

Sync with https://github.com/mozilla-ai/document-to-podcast

Files changed (2)
  1. Dockerfile +14 -0
  2. app.py +232 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM nvidia/cuda:12.2.2-cudnn8-devel-ubuntu22.04
+
+ RUN pip3 install --no-cache-dir --upgrade pip
+ RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
+ RUN pip3 install document-to-podcast
+
+ RUN groupadd --gid 1000 appuser \
+     && useradd --uid 1000 --gid 1000 --create-home appuser \
+     && chown -R appuser:appuser /home/appuser
+
+ USER appuser
+
+ EXPOSE 8501
+ ENTRYPOINT ["python3", "app.py"]
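
The pinned cu122 wheel gives a CUDA 12.2 build of llama-cpp-python to match the CUDA base image. As a rough sanity check inside the container (not part of this commit, and assuming the llama_cpp package exposes llama_supports_gpu_offload(), as recent llama-cpp-python releases do), one could verify that GPU offload is actually available:

    import llama_cpp

    # True when llama.cpp was compiled with a GPU backend (e.g. CUDA) enabled.
    if llama_cpp.llama_supports_gpu_offload():
        print("GPU offload available: the cu122 wheel matches the CUDA base image")
    else:
        print("CPU-only build: model layers will not be offloaded to the GPU")
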
app.py ADDED
@@ -0,0 +1,232 @@
+ """Streamlit app for converting documents to podcasts."""
+
+ import re
+ from pathlib import Path
+
+ import numpy as np
+ import soundfile as sf
+ import streamlit as st
+ import requests
+ from bs4 import BeautifulSoup
+ from requests.exceptions import RequestException
+
+ from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
+ from document_to_podcast.inference.model_loaders import (
+     load_llama_cpp_model,
+     load_outetts_model,
+ )
+ from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
+ from document_to_podcast.inference.text_to_speech import text_to_speech
+ from document_to_podcast.inference.text_to_text import text_to_text_stream
+
+
+ @st.cache_resource
+ def load_text_to_text_model():
+     return load_llama_cpp_model(
+         model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
+     )
+
+
+ @st.cache_resource
+ def load_text_to_speech_model():
+     return load_outetts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
+
+
+ script = "script"
+ audio = "audio"
+ gen_button = "generate podcast button"
+ if script not in st.session_state:
+     st.session_state[script] = ""
+ if audio not in st.session_state:
+     st.session_state.audio = []
+ if gen_button not in st.session_state:
+     st.session_state[gen_button] = False
+
+
+ def gen_button_clicked():
+     st.session_state[gen_button] = True
+
+
+ st.title("Document To Podcast")
+
+ st.header("Upload a File")
+
+ uploaded_file = st.file_uploader(
+     "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
+ )
+
+ if uploaded_file is not None:
+     st.divider()
+     st.header("Loading and Cleaning Data")
+     st.markdown(
+         "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-1-document-pre-processing)"
+     )
+     st.divider()
+
+     extension = Path(uploaded_file.name).suffix
+
+     col1, col2 = st.columns(2)
+
+     raw_text = DATA_LOADERS[extension](uploaded_file)
+     with col1:
+         st.subheader("Raw Text")
+         st.text_area(
+             f"Number of characters before cleaning: {len(raw_text)}",
+             f"{raw_text[:500]} . . .",
+         )
+
+     clean_text = DATA_CLEANERS[extension](raw_text)
+     with col2:
+         st.subheader("Cleaned Text")
+         st.text_area(
+             f"Number of characters after cleaning: {len(clean_text)}",
+             f"{clean_text[:500]} . . .",
+         )
+     st.session_state["clean_text"] = clean_text
+
+ st.divider()
+
+ st.header("Or Enter a Website URL")
+ url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
+ process_url = st.button("Clean URL Content")
+
+
+ def process_url_content(url: str) -> tuple[str, str]:
+     """Fetch and clean content from a URL.
+
+     Args:
+         url: The URL to fetch content from
+
+     Returns:
+         tuple containing raw and cleaned text
+     """
+     response = requests.get(url)
+     response.raise_for_status()
+     soup = BeautifulSoup(response.text, "html.parser")
+     raw_text = soup.get_text()
+     return raw_text, DATA_CLEANERS[".html"](raw_text)
+
+
+ if url and process_url:
+     try:
+         with st.spinner("Fetching and cleaning content..."):
+             raw_text, clean_text = process_url_content(url)
+             st.session_state["clean_text"] = clean_text
+
+         # Display results
+         col1, col2 = st.columns(2)
+         with col1:
+             st.subheader("Raw Text")
+             st.text_area(
+                 "Number of characters before cleaning: " f"{len(raw_text)}",
+                 f"{raw_text[:500]}...",
+             )
+         with col2:
+             st.subheader("Cleaned Text")
+             st.text_area(
+                 "Number of characters after cleaning: " f"{len(clean_text)}",
+                 f"{clean_text[:500]}...",
+             )
+     except RequestException as e:
+         st.error(f"Error fetching URL: {str(e)}")
+     except Exception as e:
+         st.error(f"Error processing content: {str(e)}")
+
+ # Second part - Podcast generation
+ if "clean_text" in st.session_state:
+     clean_text = st.session_state["clean_text"]
+
+     st.divider()
+     st.header("Downloading and Loading models")
+     st.markdown(
+         "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
+     )
+     st.divider()
+
+     # Load models
+     text_model = load_text_to_text_model()
+     speech_model = load_text_to_speech_model()
+
+     st.markdown(
+         "For this demo, we are using the following models: \n"
+         "- [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
+         "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
+     )
+     st.markdown(
+         "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
+         " for more information on how to use different models."
+     )
+
+     # ~4 characters per token is considered a reasonable default.
+     max_characters = text_model.n_ctx() * 4
+     if len(clean_text) > max_characters:
+         st.warning(
+             f"Input text is too big ({len(clean_text)})."
+             f" Using only a subset of it ({max_characters})."
+         )
+         clean_text = clean_text[:max_characters]
+
+     st.divider()
+     st.header("Podcast generation")
+     st.markdown(
+         "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-3-audio-podcast-generation)"
+     )
+     st.divider()
+
+     st.subheader("Speaker configuration")
+     for s in DEFAULT_SPEAKERS:
+         s.pop("id", None)
+     speakers = st.data_editor(DEFAULT_SPEAKERS, num_rows="dynamic")
+
+     if st.button("Generate Podcast", on_click=gen_button_clicked):
+         for n, speaker in enumerate(speakers):
+             speaker["id"] = n + 1
+         speakers_str = "\n".join(
+             str(Speaker.model_validate(speaker))
+             for speaker in speakers
+             if all(
+                 speaker.get(x, None) for x in ["name", "description", "voice_profile"]
+             )
+         )
+         system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)
+         with st.spinner("Generating Podcast..."):
+             text = ""
+             for chunk in text_to_text_stream(
+                 clean_text, text_model, system_prompt=system_prompt.strip()
+             ):
+                 text += chunk
+                 if text.endswith("\n") and "Speaker" in text:
+                     st.session_state.script += text
+                     st.write(text)
+
+                     speaker_id = re.search(r"Speaker (\d+)", text).group(1)
+                     voice_profile = next(
+                         speaker["voice_profile"]
+                         for speaker in speakers
+                         if speaker["id"] == int(speaker_id)
+                     )
+                     with st.spinner("Generating Audio..."):
+                         speech = text_to_speech(
+                             text.split(f'"Speaker {speaker_id}":')[-1],
+                             speech_model,
+                             voice_profile,
+                         )
+                     st.audio(speech, sample_rate=speech_model.audio_codec.sr)
+                     st.session_state.audio.append(speech)
+                     text = ""
+
+     if st.session_state[gen_button]:
+         if st.button("Save Podcast to audio file"):
+             st.session_state.audio = np.concatenate(st.session_state.audio)
+             sf.write(
+                 "podcast.wav",
+                 st.session_state.audio,
+                 samplerate=speech_model.audio_codec.sr,
+             )
+             st.markdown("Podcast saved to disk!")
+
+         if st.button("Save Podcast script to text file"):
+             with open("script.txt", "w") as f:
+                 st.session_state.script += "}"
+                 f.write(st.session_state.script)
+             st.markdown("Script saved to disk!")
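
The generation loop above only emits audio once a complete line of the form "Speaker N": ... has been streamed; the regex and the string split then pick the matching voice_profile from the speaker table. A small self-contained sketch of that parsing step, using a made-up script line and speaker list (illustration only, not part of the commit):

    import re

    # Hypothetical speaker table and streamed script line.
    speakers = [
        {"id": 1, "name": "Host", "voice_profile": "female_1"},
        {"id": 2, "name": "Guest", "voice_profile": "male_1"},
    ]
    line = '"Speaker 2": Thanks for having me on the show.\n'

    # Same extraction logic as in the app: find the speaker number,
    # look up its voice profile, and keep only the spoken text for TTS.
    speaker_id = re.search(r"Speaker (\d+)", line).group(1)
    voice_profile = next(
        s["voice_profile"] for s in speakers if s["id"] == int(speaker_id)
    )
    tts_input = line.split(f'"Speaker {speaker_id}":')[-1].strip()

    print(voice_profile)  # male_1
    print(tts_input)      # Thanks for having me on the show.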