// Source: seamless-streaming / streaming-react-app / src / StreamingInterface.tsx
// Last commit: Revert "Single User Mode (#21)" (66487d6), by m0wer
// File size: 44.3 kB
import {useCallback, useEffect, useLayoutEffect, useRef, useState} from 'react';
import Button from '@mui/material/Button';
import Typography from '@mui/material/Typography';
import InputLabel from '@mui/material/InputLabel';
import FormControl from '@mui/material/FormControl';
import Select, {SelectChangeEvent} from '@mui/material/Select';
import MenuItem from '@mui/material/MenuItem';
import Stack from '@mui/material/Stack';
import seamlessLogoUrl from './assets/seamless.svg';
import {
AgentCapabilities,
BaseResponse,
BrowserAudioStreamConfig,
DynamicConfig,
PartialDynamicConfig,
SUPPORTED_INPUT_SOURCES,
SUPPORTED_OUTPUT_MODES,
ServerExceptionData,
ServerSpeechData,
ServerState,
ServerTextData,
StartStreamEventConfig,
StreamingStatus,
SupportedInputSource,
SupportedOutputMode,
TranslationSentences,
} from './types/StreamingTypes';
import FormLabel from '@mui/material/FormLabel';
import RadioGroup from '@mui/material/RadioGroup';
import FormControlLabel from '@mui/material/FormControlLabel';
import Radio from '@mui/material/Radio';
import './StreamingInterface.css';
import RoomConfig from './RoomConfig';
import Divider from '@mui/material/Divider';
import {useSocket} from './useSocket';
import {RoomState} from './types/RoomState';
import useStable from './useStable';
import float32To16BitPCM from './float32To16BitPCM';
import createBufferedSpeechPlayer from './createBufferedSpeechPlayer';
import Checkbox from '@mui/material/Checkbox';
import Alert from '@mui/material/Alert';
import isScrolledToDocumentBottom from './isScrolledToDocumentBottom';
import Box from '@mui/material/Box';
import Slider from '@mui/material/Slider';
import VolumeDown from '@mui/icons-material/VolumeDown';
import VolumeUp from '@mui/icons-material/VolumeUp';
import Mic from '@mui/icons-material/Mic';
import MicOff from '@mui/icons-material/MicOff';
import XRDialog from './react-xr/XRDialog';
import getTranslationSentencesFromReceivedData from './getTranslationSentencesFromReceivedData';
import {
sliceTranslationSentencesUpToIndex,
getTotalSentencesLength,
} from './sliceTranslationSentencesUtils';
import Blink from './Blink';
import {CURSOR_BLINK_INTERVAL_MS} from './cursorBlinkInterval';
import {getURLParams} from './URLParams';
import debug from './debug';
import DebugSection from './DebugSection';
import Switch from '@mui/material/Switch';
import Grid from '@mui/material/Grid';
import {getLanguageFromThreeLetterCode} from './languageLookup';
import HeadphonesIcon from '@mui/icons-material/Headphones';
/**
 * Default audio-capture constraints, keyed by input source.
 *
 * These are the default `config` arguments of the two stream-request helpers
 * below, and the fallback values used by the component when the user has not
 * explicitly toggled noise suppression / echo cancellation (state is `null`).
 */
const AUDIO_STREAM_DEFAULTS = {
// Defaults applied when the input source is 'userMedia'
userMedia: {
echoCancellation: false,
noiseSuppression: true,
},
// Defaults applied when the input source is 'displayMedia'
displayMedia: {
echoCancellation: false,
noiseSuppression: false,
},
} as const;
/**
 * Requests an audio stream through `navigator.mediaDevices.getUserMedia`.
 *
 * @param config Audio constraints to request; defaults to the 'userMedia' entry
 *   of AUDIO_STREAM_DEFAULTS. `channelCount: 1` is always added to the request.
 * @returns The MediaStream the browser resolved with. Rejects if the underlying
 *   getUserMedia call rejects.
 */
async function requestUserMediaAudioStream(
  config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS['userMedia'],
) {
  const audioConstraints = {...config, channelCount: 1};
  const micStream = await navigator.mediaDevices.getUserMedia({
    audio: audioConstraints,
  });

  // Log the settings the browser actually applied to the first audio track
  const firstAudioTrack = micStream.getAudioTracks()?.[0];
  console.debug(
    '[requestUserMediaAudioStream] stream created with settings:',
    firstAudioTrack?.getSettings(),
  );

  return micStream;
}
/**
 * Requests an audio stream through `navigator.mediaDevices.getDisplayMedia`.
 *
 * The browser may resolve the request without any audio track (for example when
 * the user does not share audio for the chosen surface). Such a stream cannot be
 * fed into `AudioContext.createMediaStreamSource`, so it is released and an error
 * is thrown instead of being returned.
 *
 * @param config Audio constraints to request; defaults to the 'displayMedia' entry
 *   of AUDIO_STREAM_DEFAULTS. `channelCount: 1` is always added to the request.
 * @returns A MediaStream that contains at least one audio track.
 * @throws Error when the resolved stream has no audio track. Also rejects if the
 *   underlying getDisplayMedia call rejects.
 */
async function requestDisplayMediaAudioStream(
  config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS['displayMedia'],
) {
  const stream = await navigator.mediaDevices.getDisplayMedia({
    audio: {...config, channelCount: 1},
  });

  const audioTracks = stream.getAudioTracks();
  if (audioTracks.length === 0) {
    // Stop every captured track so the browser ends the capture session
    // before we report the failure to the caller.
    stream.getTracks().forEach((track) => track.stop());
    throw new Error(
      '[requestDisplayMediaAudioStream] the captured display stream has no audio track',
    );
  }

  console.debug(
    '[requestDisplayMediaAudioStream] stream created with settings:',
    audioTracks[0]?.getSettings(),
  );
  return stream;
}
/** Label shown on the start/stop streaming button for each streaming status. */
const buttonLabelMap: Record<StreamingStatus, string> = {
  stopped: 'Start Streaming',
  running: 'Stop Streaming',
  starting: 'Starting...',
};
/** Sent to the server as `buffer_limit` in the stream configuration. */
const BUFFER_LIMIT = 1;

/** Threshold (px) passed to isScrolledToDocumentBottom when handling scroll events. */
const SCROLLED_TO_BOTTOM_THRESHOLD_PX = 36;

/** Slope applied to the portion of a slider value that exceeds 1. */
const GAIN_MULTIPLIER_OVER_1 = 3;

/**
 * Maps a raw volume-slider value to the gain that is actually applied.
 *
 * Values up to and including 1 are returned unchanged; anything above 1 grows
 * GAIN_MULTIPLIER_OVER_1 times faster (e.g. 2 -> 4, 3 -> 7).
 *
 * @param value Raw (unscaled) slider value.
 * @returns The scaled gain.
 */
const getGainScaledValue = (value: number): number =>
  value > 1 ? (value - 1) * GAIN_MULTIPLIER_OVER_1 + 1 : value;

/** A performance warning is shown once this many transcoders are active on the server. */
const TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD = 2;

/** Maximum number of server exceptions retained in component state. */
const MAX_SERVER_EXCEPTIONS_TRACKED = 500;

/** Delay (ms) between revealing successive characters of the typing animation. */
export const TYPING_ANIMATION_DELAY_MS = 6;
export default function StreamingInterface() {
const urlParams = getURLParams();
const debugParam = urlParams.debug;
const [animateTextDisplay, setAnimateTextDisplay] = useState<boolean>(
urlParams.animateTextDisplay,
);
const socketObject = useSocket();
const {socket, clientID} = socketObject;
const [serverState, setServerState] = useState<ServerState | null>(null);
const [agent, setAgent] = useState<AgentCapabilities | null>(null);
const model = agent?.name ?? null;
const agentsCapabilities: Array<AgentCapabilities> =
serverState?.agentsCapabilities ?? [];
const currentAgent: AgentCapabilities | null =
agentsCapabilities.find((agent) => agent.name === model) ?? null;
const [serverExceptions, setServerExceptions] = useState<
Array<ServerExceptionData>
>([]);
const [roomState, setRoomState] = useState<RoomState | null>(null);
const roomID = roomState?.room_id ?? null;
const isSpeaker =
(clientID != null && roomState?.speakers.includes(clientID)) ?? false;
const isListener =
(clientID != null && roomState?.listeners.includes(clientID)) ?? false;
const [streamingStatus, setStreamingStatus] =
useState<StreamingStatus>('stopped');
const isStreamConfiguredRef = useRef<boolean>(false);
const [hasMaxSpeakers, setHasMaxSpeakers] = useState<boolean>(false);
const [outputMode, setOutputMode] = useState<SupportedOutputMode>('s2s&t');
const [inputSource, setInputSource] =
useState<SupportedInputSource>('userMedia');
const [enableNoiseSuppression, setEnableNoiseSuppression] = useState<
boolean | null
>(null);
const [enableEchoCancellation, setEnableEchoCancellation] = useState<
boolean | null
>(null);
// Dynamic Params:
const [targetLang, setTargetLang] = useState<string | null>(null);
const [enableExpressive, setEnableExpressive] = useState<boolean | null>(
null,
);
const [serverDebugFlag, setServerDebugFlag] = useState<boolean>(
debugParam ?? false,
);
const [receivedData, setReceivedData] = useState<Array<ServerTextData>>([]);
const [
translationSentencesAnimatedIndex,
setTranslationSentencesAnimatedIndex,
] = useState<number>(0);
const lastTranslationResultRef = useRef<HTMLDivElement | null>(null);
const [inputStream, setInputStream] = useState<MediaStream | null>(null);
const [inputStreamSource, setInputStreamSource] =
useState<MediaStreamAudioSourceNode | null>(null);
const audioContext = useStable<AudioContext>(() => new AudioContext());
const [scriptNodeProcessor, setScriptNodeProcessor] =
useState<ScriptProcessorNode | null>(null);
const [muted, setMuted] = useState<boolean>(false);
// The onaudioprocess script needs an up-to-date reference to the muted state, so
// we use a ref here and keep it in sync via useEffect
const mutedRef = useRef<boolean>(muted);
useEffect(() => {
mutedRef.current = muted;
}, [muted]);
const [gain, setGain] = useState<number>(1);
const isScrolledToBottomRef = useRef<boolean>(isScrolledToDocumentBottom());
// Some config options must be set when starting streaming and cannot be changed dynamically.
// This controls whether they are disabled or not
const streamFixedConfigOptionsDisabled =
streamingStatus !== 'stopped' || roomID == null;
const bufferedSpeechPlayer = useStable(() => {
const player = createBufferedSpeechPlayer({
onStarted: () => {
console.debug('📢 PLAYBACK STARTED 📢');
},
onEnded: () => {
console.debug('🛑 PLAYBACK ENDED 🛑');
},
});
// Start the player now so it eagerly plays audio when it arrives
player.start();
return player;
});
const translationSentencesBase: TranslationSentences =
getTranslationSentencesFromReceivedData(receivedData);
const translationSentencesBaseTotalLength = getTotalSentencesLength(
translationSentencesBase,
);
const translationSentences: TranslationSentences = animateTextDisplay
? sliceTranslationSentencesUpToIndex(
translationSentencesBase,
translationSentencesAnimatedIndex,
)
: translationSentencesBase;
// We want the blinking cursor to show before any text has arrived, so let's add an empty string so that the cursor shows up
const translationSentencesWithEmptyStartingString =
streamingStatus === 'running' && translationSentences.length === 0
? ['']
: translationSentences;
/******************************************
* Event Handlers
******************************************/
// Selects `newAgent` as the active agent. When its name differs from the previously
// selected agent's name, the agent-specific params are reset: the target language
// becomes the new agent's first target language (or null) and the expressive
// setting is cleared.
const setAgentAndUpdateParams = useCallback(
  (newAgent: AgentCapabilities | null) => {
    setAgent((previousAgent) => {
      const isSameAgent = previousAgent?.name === newAgent?.name;
      if (isSameAgent) {
        return newAgent;
      }
      const defaultTargetLang = newAgent?.targetLangs[0] ?? null;
      setTargetLang(defaultTargetLang);
      setEnableExpressive(null);
      return newAgent;
    });
  },
  [],
);
/**
 * Emits `set_dynamic_config` with the given partial config and settles once the
 * server acknowledges it.
 *
 * @param partialConfig The dynamic config fields to send.
 * @returns A promise that resolves when the ack status is 'ok'. It rejects with an
 *   Error when the socket is null or when the ack carries any other status.
 */
const onSetDynamicConfig = useCallback(
  (partialConfig: PartialDynamicConfig): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      if (socket == null) {
        reject(new Error('[onSetDynamicConfig] socket is null '));
        return;
      }
      socket.emit(
        'set_dynamic_config',
        partialConfig,
        (result: BaseResponse) => {
          console.log('[emit result: set_dynamic_config]', result);
          if (result.status === 'ok') {
            resolve();
            return;
          }
          // Always reject with an Error so callers get a message and a stack
          // instead of `undefined`.
          reject(
            new Error(
              `[onSetDynamicConfig] set_dynamic_config returned status: ${result.status}`,
            ),
          );
        },
      );
    }),
  [socket],
);
/**
 * Emits `configure_stream` with the start-of-stream configuration and settles
 * once the server acknowledges it.
 *
 * Side effects of the ack: `hasMaxSpeakers` is set from whether the ack message is
 * 'max_speakers', and `isStreamConfiguredRef` is set to whether the status is 'ok'.
 *
 * @param sampleRate Sample rate sent to the server as `rate`.
 * @returns A promise that resolves when the ack status is 'ok'. It rejects with an
 *   Error when the socket is null, when no agent is selected, or when the ack
 *   carries any other status.
 */
const configureStreamAsync = ({sampleRate}: {sampleRate: number}) => {
  return new Promise<void>((resolve, reject) => {
    if (socket == null) {
      reject(new Error('[configureStreamAsync] socket is null '));
      return;
    }
    const modelName = agent?.name ?? null;
    if (modelName == null) {
      reject(new Error('[configureStreamAsync] modelName is null '));
      return;
    }
    const config: StartStreamEventConfig = {
      event: 'config',
      rate: sampleRate,
      model_name: modelName,
      debug: serverDebugFlag,
      // synchronous processing isn't implemented on the v2 pubsub server, so hardcode this to true
      async_processing: true,
      buffer_limit: BUFFER_LIMIT,
      model_type: outputMode,
    };
    console.log('[configureStreamAsync] sending config', config);
    socket.emit(
      'configure_stream',
      config,
      (statusObject: BaseResponse & {message?: string}) => {
        setHasMaxSpeakers(statusObject.message === 'max_speakers');

        const isConfigured = statusObject.status === 'ok';
        // The onaudioprocess handler reads this ref to decide whether to emit audio
        isStreamConfiguredRef.current = isConfigured;

        if (!isConfigured) {
          reject(
            new Error(
              `[configureStreamAsync] configure_stream returned status: ${statusObject.status}`,
            ),
          );
          return;
        }
        console.debug(
          '[configureStreamAsync] stream configured!',
          statusObject,
        );
        resolve();
      },
    );
  });
};
/**
 * Starts a streaming session.
 *
 * Steps: resume the AudioContext if it is suspended, request the input stream for
 * the selected input source, wire it through a ScriptProcessorNode whose
 * `onaudioprocess` handler emits `incoming_audio`, start the speech player, send
 * the full dynamic config, and finally configure the stream on the server.
 *
 * The status moves 'stopped' -> 'starting' -> 'running'. If any step fails, every
 * resource acquired so far is released and the status returns to 'stopped', so the
 * UI can never be left stranded in 'starting' with the capture still active.
 */
const startStreaming = async () => {
  if (streamingStatus !== 'stopped') {
    console.warn(
      `Attempting to start stream when status is ${streamingStatus}`,
    );
    return;
  }
  setStreamingStatus('starting');

  // The ref may still be true from a previous session. Clear it so onaudioprocess
  // does not emit audio before configure_stream is acknowledged for this session.
  isStreamConfiguredRef.current = false;

  let stream: MediaStream | null = null;
  let mediaStreamSource: MediaStreamAudioSourceNode | null = null;
  let scriptProcessor: ScriptProcessorNode | null = null;

  // Releases whatever part of the audio pipeline has been set up so far and puts
  // the component back into the 'stopped' state.
  const abortStart = () => {
    isStreamConfiguredRef.current = false;
    if (scriptProcessor != null) {
      scriptProcessor.onaudioprocess = null;
      scriptProcessor.disconnect();
    }
    mediaStreamSource?.disconnect();
    // Release the capture so the browser stops showing the recording indicator
    stream?.getTracks().forEach((track) => track.stop());
    setInputStream(null);
    setInputStreamSource(null);
    setScriptNodeProcessor(null);
    setStreamingStatus('stopped');
  };

  try {
    if (audioContext.state === 'suspended') {
      console.warn('audioContext was suspended! resuming...');
      await audioContext.resume();
    }

    if (inputSource === 'userMedia') {
      stream = await requestUserMediaAudioStream({
        noiseSuppression:
          enableNoiseSuppression ??
          AUDIO_STREAM_DEFAULTS['userMedia'].noiseSuppression,
        echoCancellation:
          enableEchoCancellation ??
          AUDIO_STREAM_DEFAULTS['userMedia'].echoCancellation,
      });
    } else if (inputSource === 'displayMedia') {
      stream = await requestDisplayMediaAudioStream({
        noiseSuppression:
          enableNoiseSuppression ??
          AUDIO_STREAM_DEFAULTS['displayMedia'].noiseSuppression,
        echoCancellation:
          enableEchoCancellation ??
          AUDIO_STREAM_DEFAULTS['displayMedia'].echoCancellation,
      });
    } else {
      throw new Error(`Unsupported input source requested: ${inputSource}`);
    }
    setInputStream(stream);

    // Throws if the stream has no audio track, hence inside the try block
    mediaStreamSource = audioContext.createMediaStreamSource(stream);
    setInputStreamSource(mediaStreamSource);

    /**
     * NOTE: This currently uses a deprecated way of processing the audio (createScriptProcessor), but
     * which is easy and convenient for our purposes.
     *
     * Documentation for the deprecated way of doing it is here: https://developer.mozilla.org/en-US/docs/Web/API/BaseAudioContext/createScriptProcessor
     *
     * In an ideal world this would be migrated to something like this SO answer: https://stackoverflow.com/a/65448287
     */
    scriptProcessor = audioContext.createScriptProcessor(16384, 1, 1);
    setScriptNodeProcessor(scriptProcessor);

    scriptProcessor.onaudioprocess = (event) => {
      if (isStreamConfiguredRef.current === false) {
        console.debug('[onaudioprocess] stream is not configured yet!');
        return;
      }
      if (socket == null) {
        console.warn('[onaudioprocess] socket is null in onaudioprocess');
        return;
      }
      if (mutedRef.current) {
        // We still want to send audio to the server when we're muted to ensure we
        // get any remaining audio back from the server, so let's pass an array length 1 with a value of 0
        const mostlyEmptyInt16Array = new Int16Array(1);
        socket.emit('incoming_audio', mostlyEmptyInt16Array);
      } else {
        const float32Audio = event.inputBuffer.getChannelData(0);
        const pcm16Audio = float32To16BitPCM(float32Audio);
        socket.emit('incoming_audio', pcm16Audio);
      }
      debug()?.sentAudio(event);
    };

    mediaStreamSource.connect(scriptProcessor);
    scriptProcessor.connect(audioContext.destination);

    bufferedSpeechPlayer.start();

    if (targetLang == null) {
      throw new Error('[startStreaming] targetLang cannot be nullish');
    }
    // When we are starting the stream we want to pass all the dynamic config values
    // available before actually configuring and starting the stream
    const fullDynamicConfig: DynamicConfig = {
      targetLanguage: targetLang,
      expressive: enableExpressive,
    };
    await onSetDynamicConfig(fullDynamicConfig);

    // NOTE: this needs to be the *audioContext* sample rate, not the sample rate of the input stream. Not entirely sure why.
    await configureStreamAsync({
      sampleRate: audioContext.sampleRate,
    });
  } catch (e) {
    console.error('[startStreaming] failed to start streaming:', e);
    abortStart();
    return;
  }

  setStreamingStatus('running');
};
/**
 * Stops the current streaming session: halts speech playback, tears down the audio
 * processing graph, releases the captured input tracks, emits `stop_stream`, and
 * sets the status to 'stopped'. Does nothing (other than warn) when the status is
 * already 'stopped'.
 */
const stopStreaming = useCallback(async () => {
  if (streamingStatus === 'stopped') {
    console.warn(
      `Attempting to stop stream when status is ${streamingStatus}`,
    );
    return;
  }

  // Stop the speech playback right away
  bufferedSpeechPlayer.stop();

  // The next session must be configured again before any audio is emitted, so
  // clear the flag that the onaudioprocess handler checks.
  isStreamConfiguredRef.current = false;

  if (inputStreamSource == null || scriptNodeProcessor == null) {
    console.error(
      'inputStreamSource || scriptNodeProcessor is null in stopStreaming',
    );
  } else {
    // Drop the handler so the processor no longer holds on to this session's closure
    scriptNodeProcessor.onaudioprocess = null;
    inputStreamSource.disconnect(scriptNodeProcessor);
    scriptNodeProcessor.disconnect(audioContext.destination);
  }

  // Release the mic input so we stop showing the red recording icon in the browser.
  // This happens even when the nodes above are missing, so the capture never leaks.
  inputStream?.getTracks().forEach((track) => track.stop());

  if (socket == null) {
    console.warn('Unable to emit stop_stream because socket is null');
  } else {
    socket.emit('stop_stream', (result: unknown) => {
      console.debug('[emit result: stop_stream]', result);
    });
  }

  setStreamingStatus('stopped');
}, [
  audioContext.destination,
  bufferedSpeechPlayer,
  inputStream,
  inputStreamSource,
  scriptNodeProcessor,
  socket,
  streamingStatus,
]);
// Emits `clear_transcript_for_all` on the socket; a no-op while the socket is null.
const onClearTranscriptForAll = useCallback(() => {
  socket?.emit('clear_transcript_for_all');
}, [socket]);
/******************************************
* Effects
******************************************/
// Keep local room state in sync with `room_state_update` events from the socket.
useEffect(() => {
  if (socket == null) {
    return;
  }

  const handleRoomStateUpdate = (nextRoomState: RoomState) => {
    setRoomState(nextRoomState);
  };

  socket.on('room_state_update', handleRoomStateUpdate);
  return () => {
    socket.off('room_state_update', handleRoomStateUpdate);
  };
}, [socket]);
// Subscribe to translated text (appended to receivedData) and translated speech
// (queued on the buffered speech player) while a socket is available.
useEffect(() => {
  if (socket == null) {
    return;
  }

  const handleTranslationText = (textData: ServerTextData) => {
    setReceivedData((previous) => [...previous, textData]);
    debug()?.receivedText(textData.payload);
  };

  const handleTranslationSpeech = (speechData: ServerSpeechData) => {
    bufferedSpeechPlayer.addAudioToBuffer(
      speechData.payload,
      speechData.sample_rate,
    );
  };

  socket.on('translation_text', handleTranslationText);
  socket.on('translation_speech', handleTranslationSpeech);

  return () => {
    socket.off('translation_text', handleTranslationText);
    socket.off('translation_speech', handleTranslationSpeech);
  };
}, [bufferedSpeechPlayer, socket]);
// React to `server_state_update` events: store the new server state, stop our own
// stream if another client holds an active server lock, and pick the first
// advertised agent when none has been selected yet.
useEffect(() => {
  if (socket == null) {
    return;
  }

  const handleServerStateUpdate = (newServerState: ServerState) => {
    setServerState(newServerState);

    // If a client creates a server lock, we want to stop streaming if we're not them
    const lock = newServerState.serverLock;
    const isLockedByOtherClient =
      lock?.isActive === true && lock?.clientID !== clientID;
    if (isLockedByOtherClient && streamingStatus === 'running') {
      stopStreaming();
    }

    const [firstAdvertisedAgent] = [newServerState.agentsCapabilities[0]];
    if (agent == null && firstAdvertisedAgent != null) {
      setAgentAndUpdateParams(firstAdvertisedAgent);
    }
  };

  socket.on('server_state_update', handleServerStateUpdate);
  return () => {
    socket.off('server_state_update', handleServerStateUpdate);
  };
}, [
  agent,
  clientID,
  setAgentAndUpdateParams,
  socket,
  stopStreaming,
  streamingStatus,
]);
// Track `server_exception` events: each one is stamped with a locale-formatted
// client time string, prepended to serverExceptions (capped at
// MAX_SERVER_EXCEPTIONS_TRACKED entries) and logged to the console.
useEffect(() => {
  if (socket == null) {
    return;
  }

  const handleServerException = (rawException: ServerExceptionData) => {
    const timeStringClient = new Date(
      rawException['timeEpochMs'],
    ).toLocaleString();
    const exceptionData = {...rawException, timeStringClient};

    setServerExceptions((previous) => {
      const newestFirst = [exceptionData, ...previous];
      return newestFirst.slice(0, MAX_SERVER_EXCEPTIONS_TRACKED);
    });

    console.error(
      `[server_exception] The server encountered an exception: ${exceptionData['message']}`,
      exceptionData,
    );
  };

  socket.on('server_exception', handleServerException);
  return () => {
    socket.off('server_exception', handleServerException);
  };
}, [socket]);
// On `clear_transcript`, drop all received text and restart the typing animation
// from the beginning.
useEffect(() => {
  if (socket == null) {
    return;
  }

  const handleClearTranscript = () => {
    setReceivedData([]);
    setTranslationSentencesAnimatedIndex(0);
  };

  socket.on('clear_transcript', handleClearTranscript);
  return () => {
    socket.off('clear_transcript', handleClearTranscript);
  };
}, [socket]);
// Record, on every document scroll event, whether the page is scrolled to the
// bottom (within SCROLLED_TO_BOTTOM_THRESHOLD_PX). The auto-scroll layout effect
// reads this ref.
useEffect(() => {
  const handleScroll = () => {
    isScrolledToBottomRef.current = isScrolledToDocumentBottom(
      SCROLLED_TO_BOTTOM_THRESHOLD_PX,
    )
      ? true
      : false;
  };

  document.addEventListener('scroll', handleScroll);
  return () => {
    document.removeEventListener('scroll', handleScroll);
  };
}, []);
// Keep the newest transcript entry in view, but only while the user was already
// scrolled to the bottom of the document.
useLayoutEffect(() => {
  const lastResultElement = lastTranslationResultRef.current;
  if (lastResultElement == null || !isScrolledToBottomRef.current) {
    return;
  }
  // Scroll the div to the most recent entry
  lastResultElement.scrollIntoView();

  // Run the effect every time data is received, so that
  // we scroll to the bottom even if we're just adding text to
  // a pre-existing chunk
}, [receivedData]);
// Drive the typing animation: while animation is enabled and the animated index is
// behind the total text length, advance it by one after TYPING_ANIMATION_DELAY_MS.
// Once it has caught up, report the end of text rendering to the debug helper.
useEffect(() => {
  if (!animateTextDisplay) {
    return;
  }

  const hasCaughtUp =
    translationSentencesAnimatedIndex >= translationSentencesBaseTotalLength;
  if (hasCaughtUp) {
    debug()?.endRenderText();
    return;
  }

  const timeoutID = setTimeout(() => {
    setTranslationSentencesAnimatedIndex((previous) => previous + 1);
    debug()?.startRenderText();
  }, TYPING_ANIMATION_DELAY_MS);

  return () => clearTimeout(timeoutID);
}, [
  animateTextDisplay,
  translationSentencesAnimatedIndex,
  translationSentencesBaseTotalLength,
]);
/******************************************
* Sub-components
******************************************/
// Volume control for the listener. The slider position is kept unscaled in React
// state (`gain`), while the speech player receives the value mapped through
// getGainScaledValue. The slider is controlled via `value`, so no `defaultValue`
// is passed (that prop is only meant for uncontrolled usage).
const volumeSliderNode = (
  <Stack
    spacing={2}
    direction="row"
    sx={{mb: 1, width: '100%'}}
    alignItems="center">
    <VolumeDown color="primary" />
    <Slider
      aria-label="Volume"
      scale={getGainScaledValue}
      min={0}
      max={3}
      step={0.1}
      marks={[
        {value: 0, label: '0%'},
        {value: 1, label: '100%'},
        {value: 2, label: '400%'},
        {value: 3, label: '700%'},
      ]}
      valueLabelFormat={(value) => `${(value * 100).toFixed(0)}%`}
      valueLabelDisplay="auto"
      value={gain}
      onChange={(_event: Event, newValue: number | number[]) => {
        if (typeof newValue === 'number') {
          const scaledGain = getGainScaledValue(newValue);
          // We want the actual gain node to use the scaled value
          bufferedSpeechPlayer.setGain(scaledGain);
          // But we want react state to keep track of the non-scaled value
          setGain(newValue);
        } else {
          console.error(
            `[volume slider] Unexpected non-number value: ${newValue}`,
          );
        }
      }}
    />
    <VolumeUp color="primary" />
  </Stack>
);
// XR dialog element, rendered next to the streaming controls for speakers and on
// its own for listeners. While AR is visible the typing animation is turned off in
// this component; when AR is hidden it is restored from the URL parameter.
const xrDialogComponent = (
  <XRDialog
    animateTextDisplay={
      animateTextDisplay &&
      translationSentencesAnimatedIndex === translationSentencesBaseTotalLength
    }
    bufferedSpeechPlayer={bufferedSpeechPlayer}
    translationSentences={translationSentences}
    roomState={roomState}
    roomID={roomID}
    startStreaming={startStreaming}
    stopStreaming={stopStreaming}
    debugParam={debugParam}
    onARHidden={() => {
      setAnimateTextDisplay(urlParams.animateTextDisplay);
    }}
    onARVisible={() => setAnimateTextDisplay(false)}
  />
);
return (
<div className="app-wrapper-sra">
<Box
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore Not sure why it's complaining about complexity here
sx={{width: '100%', maxWidth: '660px', minWidth: '320px'}}>
<div className="main-container-sra">
<div className="top-section-sra horizontal-padding-sra">
<div className="header-container-sra">
<img
src={seamlessLogoUrl}
className="header-icon-sra"
alt="Seamless Translation Logo"
height={24}
width={24}
/>
<div>
<Typography variant="h1" sx={{color: '#65676B'}}>
Seamless Translation
</Typography>
</div>
</div>
<div className="header-container-sra">
<div>
<Typography variant="body2" sx={{color: '#65676B'}}>
Welcome! This space is limited to one speaker at a time.
If using the live HF space, sharing room code to listeners on another
IP address may not work because it's running on different replicas.
Use headphones if you are both speaker and listener to prevent feedback.
<br/>
If max speakers reached, please duplicate the space <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/spaces/facebook/seamless-streaming?duplicate=true">here</a>.
In your duplicated space, join a room as speaker or listener (or both),
and share the room code to invite listeners.
<br/>
Check out the seamless_communication <a target="_blank" rel="noopener noreferrer" href="https://github.com/facebookresearch/seamless_communication/tree/main">README</a> for more information.
<br/>
SeamlessStreaming model is a research model and is not released
for production deployment. It is important to use a microphone with
noise cancellation (for e.g. a smartphone), otherwise you may see model hallucination on noises.
It works best if you pause every couple of sentences, or you may wish adjust the VAD threshold
in the model config. The real-time performance will degrade
if you try streaming multiple speakers at the same time.
</Typography>
</div>
</div>
<Stack spacing="22px" direction="column">
<Box>
<RoomConfig
roomState={roomState}
serverState={serverState}
streamingStatus={streamingStatus}
onJoinRoomOrUpdateRoles={() => {
// If the user has switched from speaker to listener we need to tell the
// player to play eagerly, since currently the listener doesn't have any stop/start controls
bufferedSpeechPlayer.start();
}}
/>
{isListener && !isSpeaker && (
<Box
sx={{
paddingX: 6,
paddingBottom: 2,
marginY: 2,
display: 'flex',
flexDirection: 'column',
alignItems: 'center',
}}>
{volumeSliderNode}
</Box>
)}
</Box>
{isSpeaker && (
<>
<Divider />
<Stack spacing="12px" direction="column">
  {/* This label has its own id. The id "output-modes-radio-group-label" belongs
      to the Output label, which the output-mode RadioGroup references through
      aria-labelledby; reusing it here produced duplicate DOM ids. */}
  <FormLabel id="model-selector-group-label">
    Model
  </FormLabel>
  <FormControl
    disabled={
      streamFixedConfigOptionsDisabled ||
      agentsCapabilities.length === 0
    }
    fullWidth
    sx={{minWidth: '14em'}}>
    <InputLabel id="model-selector-input-label">
      Model
    </InputLabel>
    <Select
      labelId="model-selector-input-label"
      label="Model"
      onChange={(e: SelectChangeEvent) => {
        const newAgent =
          agentsCapabilities.find(
            (agent) => e.target.value === agent.name,
          ) ?? null;
        if (newAgent == null) {
          console.error(
            'Unable to find agent with name',
            e.target.value,
          );
        }
        setAgentAndUpdateParams(newAgent);
      }}
      value={model ?? ''}>
      {agentsCapabilities.map((agent) => (
        <MenuItem value={agent.name} key={agent.name}>
          {agent.name}
        </MenuItem>
      ))}
    </Select>
  </FormControl>
</Stack>
<Stack spacing={0.5}>
<FormLabel id="output-modes-radio-group-label">
Output
</FormLabel>
<Box sx={{paddingTop: 2, paddingBottom: 1}}>
<FormControl fullWidth sx={{minWidth: '14em'}}>
  <InputLabel id="target-selector-input-label">
    Target Language
  </InputLabel>
  <Select
    labelId="target-selector-input-label"
    label="Target Language"
    onChange={(e: SelectChangeEvent) => {
      setTargetLang(e.target.value);
      // onSetDynamicConfig rejects when the socket is null or the server
      // reports a non-ok status; handle that here so it does not surface as
      // an unhandled promise rejection.
      onSetDynamicConfig({
        targetLanguage: e.target.value,
      }).catch((error: unknown) => {
        console.error(
          '[onSetDynamicConfig] failed to update targetLanguage',
          error,
        );
      });
    }}
    value={targetLang ?? ''}>
    {currentAgent?.targetLangs.map((langCode) => (
      <MenuItem value={langCode} key={langCode}>
        {getLanguageFromThreeLetterCode(langCode) != null
          ? `${getLanguageFromThreeLetterCode(
              langCode,
            )} (${langCode})`
          : langCode}
      </MenuItem>
    ))}
  </Select>
</FormControl>
</Box>
<Grid container>
<Grid item xs={12} sm={4}>
<FormControl
disabled={streamFixedConfigOptionsDisabled}>
<RadioGroup
aria-labelledby="output-modes-radio-group-label"
value={outputMode}
onChange={(e) =>
setOutputMode(
e.target.value as SupportedOutputMode,
)
}
name="output-modes-radio-buttons-group">
{
// TODO: Use supported modalities from agentCapabilities
SUPPORTED_OUTPUT_MODES.map(({value, label}) => (
<FormControlLabel
key={value}
value={value}
control={<Radio />}
label={label}
/>
))
}
</RadioGroup>
</FormControl>
</Grid>
<Grid item xs={12} sm={8}>
<Stack
direction="column"
spacing={1}
alignItems="flex-start"
sx={{flexGrow: 1}}>
{currentAgent?.dynamicParams?.includes(
'expressive',
) && (
<FormControlLabel
  control={
    <Switch
      checked={enableExpressive ?? false}
      onChange={(
        event: React.ChangeEvent<HTMLInputElement>,
      ) => {
        const newValue = event.target.checked;
        setEnableExpressive(newValue);
        // onSetDynamicConfig rejects when the socket is null or the server
        // reports a non-ok status; handle that here so it does not surface
        // as an unhandled promise rejection.
        onSetDynamicConfig({
          expressive: newValue,
        }).catch((error: unknown) => {
          console.error(
            '[onSetDynamicConfig] failed to update expressive',
            error,
          );
        });
      }}
    />
  }
  label="Expressive"
/>
)}
{isListener && (
<Box
sx={{
flexGrow: 1,
paddingX: 1.5,
paddingY: 1.5,
width: '100%',
}}>
{volumeSliderNode}
</Box>
)}
</Stack>
</Grid>
</Grid>
</Stack>
<Stack
direction="row"
spacing={2}
justifyContent="space-between">
<Box sx={{flex: 1}}>
<FormControl disabled={streamFixedConfigOptionsDisabled}>
<FormLabel id="input-source-radio-group-label">
Input Source
</FormLabel>
<RadioGroup
aria-labelledby="input-source-radio-group-label"
value={inputSource}
onChange={(e: React.ChangeEvent<HTMLInputElement>) =>
setInputSource(
e.target.value as SupportedInputSource,
)
}
name="input-source-radio-buttons-group">
{SUPPORTED_INPUT_SOURCES.map(({label, value}) => (
<FormControlLabel
key={value}
value={value}
control={<Radio />}
label={label}
/>
))}
</RadioGroup>
</FormControl>
</Box>
<Box sx={{flex: 1, flexGrow: 2}}>
<FormControl disabled={streamFixedConfigOptionsDisabled}>
<FormLabel>Options</FormLabel>
<FormControlLabel
control={
<Checkbox
checked={
enableNoiseSuppression ??
AUDIO_STREAM_DEFAULTS[inputSource]
.noiseSuppression
}
onChange={(
event: React.ChangeEvent<HTMLInputElement>,
) =>
setEnableNoiseSuppression(event.target.checked)
}
/>
}
label="Noise Suppression"
/>
<FormControlLabel
control={
<Checkbox
checked={
enableEchoCancellation ??
AUDIO_STREAM_DEFAULTS[inputSource]
.echoCancellation
}
onChange={(
event: React.ChangeEvent<HTMLInputElement>,
) =>
setEnableEchoCancellation(event.target.checked)
}
/>
}
label="Echo Cancellation (not recommended)"
/>
<FormControlLabel
control={
<Checkbox
checked={serverDebugFlag}
onChange={(
event: React.ChangeEvent<HTMLInputElement>,
) => setServerDebugFlag(event.target.checked)}
/>
}
label="Enable Server Debugging"
/>
</FormControl>
</Box>
</Stack>
{isSpeaker &&
isListener &&
inputSource === 'userMedia' &&
!enableEchoCancellation &&
gain !== 0 && (
<div>
<Alert severity="warning" icon={<HeadphonesIcon />}>
Headphones required to prevent feedback.
</Alert>
</div>
)}
{isSpeaker && enableEchoCancellation && (
<div>
<Alert severity="warning">
We don't recommend using echo cancellation as it may
distort the input audio. If possible, use headphones and
disable echo cancellation instead.
</Alert>
</div>
)}
<Stack direction="row" spacing={2}>
{streamingStatus === 'stopped' ? (
<Button
variant="contained"
onClick={startStreaming}
disabled={
roomID == null ||
// Prevent users from starting streaming if there is a server lock with an active session
(serverState?.serverLock?.isActive === true &&
serverState.serverLock.clientID !== clientID)
}>
{buttonLabelMap[streamingStatus]}
</Button>
) : (
<Button
variant="contained"
color={
streamingStatus === 'running' ? 'error' : 'primary'
}
disabled={
streamingStatus === 'starting' || roomID == null
}
onClick={stopStreaming}>
{buttonLabelMap[streamingStatus]}
</Button>
)}
<Box>
<Button
variant="contained"
aria-label={muted ? 'Unmute' : 'Mute'}
color={muted ? 'info' : 'primary'}
onClick={() => setMuted((prev) => !prev)}
sx={{
borderRadius: 100,
paddingX: 0,
minWidth: '36px',
}}>
{muted ? <MicOff /> : <Mic />}
</Button>
</Box>
{roomID == null ? null : (
<Box
sx={{
flexGrow: 1,
display: 'flex',
justifyContent: 'flex-end',
}}>
{xrDialogComponent}
</Box>
)}
</Stack>
{serverExceptions.length > 0 && (
<div>
<Alert severity="error">
{`The server encountered an exception. See the browser console for details. You may need to refresh the page to continue using the app.`}
</Alert>
</div>
)}
{serverState != null && hasMaxSpeakers && (
<div>
<Alert severity="error">
{`Maximum number of speakers reached. Please try again at a later time.`}
</Alert>
</div>
)}
{serverState != null &&
serverState.totalActiveTranscoders >=
TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD && (
<div>
<Alert severity="warning">
{`The server currently has ${serverState?.totalActiveTranscoders} active streaming sessions. Performance may be degraded.`}
</Alert>
</div>
)}
{serverState?.serverLock != null &&
serverState.serverLock.clientID !== clientID && (
<div>
<Alert severity="warning">
{`The server is currently locked. Priority will be given to that client when they are streaming, and your streaming session may be halted abruptly.`}
</Alert>
</div>
)}
</>
)}
</Stack>
{isListener && !isSpeaker && (
<Box sx={{marginBottom: 1, marginTop: 2}}>
{xrDialogComponent}
</Box>
)}
</div>
{debugParam && roomID != null && <DebugSection />}
<div className="translation-text-container-sra horizontal-padding-sra">
<Stack
direction="row"
spacing={2}
sx={{mb: '16px', alignItems: 'center'}}>
<Typography variant="h1" sx={{fontWeight: 700, flexGrow: 1}}>
Transcript
</Typography>
{isSpeaker && (
<Button
variant="text"
size="small"
onClick={onClearTranscriptForAll}>
Clear Transcript for All
</Button>
)}
</Stack>
<Stack direction="row">
<div className="translation-text-sra">
{translationSentencesWithEmptyStartingString.map(
(sentence, index, arr) => {
const isLast = index === arr.length - 1;
const maybeRef = isLast
? {ref: lastTranslationResultRef}
: {};
return (
<div className="text-chunk-sra" key={index} {...maybeRef}>
<Typography variant="body1">
{sentence}
{animateTextDisplay && isLast && (
<Blink
intervalMs={CURSOR_BLINK_INTERVAL_MS}
shouldBlink={
(roomState?.activeTranscoders ?? 0) > 0
}>
<Typography
component="span"
variant="body1"
sx={{
display: 'inline-block',
transform: 'scaleY(1.25) translateY(-1px)',
}}>
{'|'}
</Typography>
</Blink>
)}
</Typography>
</div>
);
},
)}
</div>
</Stack>
</div>
</div>
</Box>
</div>
);
}