speech-input

Previous · Next

Preview

Loading preview…
components/ui/speech-input.tsx
"use client"

import {
  Children,
  createContext,
  forwardRef,
  isValidElement,
  useCallback,
  useContext,
  useEffect,
  useRef,
  type ComponentPropsWithoutRef,
  type ReactNode,
} from "react"
import { cva, VariantProps } from "class-variance-authority"
import { motion } from "framer-motion"
import { MicIcon, SquareIcon, XIcon } from "lucide-react"

import { cn } from "@/lib/utils"
import {
  useScribe,
  type AudioFormat,
  type CommitStrategy,
} from "@/registry/elevenlabs-ui/hooks/use-scribe"
import { Button } from "@/components/ui/button"

/**
 * Size classes used by the record and cancel buttons.
 *
 * The base class forces zero horizontal padding (`!px-0`); each size sets an
 * equal height and width. `default` applies when no size is given.
 */
const buttonVariants = cva("!px-0", {
  variants: {
    size: {
      default: "h-9 w-9",
      sm: "h-8 w-8",
      lg: "h-10 w-10",
    },
  },
  defaultVariants: {
    size: "default",
  },
})

// Context for sharing state between compound components
/**
 * Value that `SpeechInput` publishes to its compound children through
 * `SpeechInputContext`.
 */
interface SpeechInputContextValue {
  /** Mirrors `isConnected` from the `useScribe` hook. */
  isConnected: boolean
  /** True while the hook's `status` equals `"connecting"`. */
  isConnecting: boolean
  /** Committed text followed by partial text, as produced by `buildTranscript`. */
  transcript: string
  /** The hook's current `partialTranscript` value. */
  partialTranscript: string
  /** The `text` of each entry in the hook's `committedTranscripts`. */
  committedTranscripts: string[]
  /** The hook's `error` value. */
  error: string | null
  /** Resets the tracked transcripts, awaits `getToken`, then connects. */
  start: () => Promise<void>
  /** Disconnects and calls `onStop` with the transcript gathered so far. */
  stop: () => void
  /** Disconnects, clears the transcripts, and calls `onCancel` with the text captured before clearing. */
  cancel: () => void
  /** Size variant applied by the record and cancel buttons. */
  size: VariantProps<typeof buttonVariants>["size"]
}

// The `null` default is what lets `useSpeechInput` detect a missing provider.
const SpeechInputContext = createContext<SpeechInputContextValue | null>(null)

/**
 * Reads the state and actions published by the enclosing `SpeechInput`.
 *
 * @returns The current context value.
 * @throws Error when called outside of a `SpeechInput` provider.
 */
function useSpeechInput() {
  const speechInput = useContext(SpeechInputContext)
  if (speechInput) {
    return speechInput
  }
  throw new Error(
    "SpeechInput compound components must be used within a SpeechInput"
  )
}

// Root component
/**
 * Transcript snapshot passed to the `onChange`, `onStart`, `onStop` and
 * `onCancel` callbacks.
 */
interface SpeechInputEvent {
  /** Partial transcript text as it was given to `buildEvent`. */
  partialTranscript: string
  /** Committed transcript segments as they were given to `buildEvent`. */
  committedTranscripts: string[]
  /** Committed text followed by partial text, as produced by `buildTranscript`. */
  transcript: string
}

/** Props accepted by the `SpeechInput` root component. */
interface SpeechInputProps {
  /** Rendered inside the wrapper `div`, within the context provider. */
  children: ReactNode
  /** Awaited on every `start()`; the resolved string is passed as `token` to `scribe.connect`. */
  getToken: () => Promise<string>
  /** Called after each partial and each committed transcript callback from `useScribe`. */
  onChange?: (event: SpeechInputEvent) => void
  /** Called by `cancel()` with the event built before the transcripts are cleared. */
  onCancel?: (event: SpeechInputEvent) => void
  /** Called once `scribe.connect` resolves, unless the start request was superseded in the meantime. */
  onStart?: (event: SpeechInputEvent) => void
  /** Called by `stop()` after disconnecting. */
  onStop?: (event: SpeechInputEvent) => void
  /** Merged into the wrapper `div`'s class list. */
  className?: string
  /** Button size exposed through context; defaults to `"default"`. */
  size?: VariantProps<typeof buttonVariants>["size"]

  // Connection options (forwarded to `useScribe`)
  /** Defaults to `"scribe_v2_realtime"`. */
  modelId?: string
  baseUri?: string

  // VAD options (forwarded to `useScribe` unchanged)
  commitStrategy?: CommitStrategy
  vadSilenceThresholdSecs?: number
  vadThreshold?: number
  minSpeechDurationMs?: number
  minSilenceDurationMs?: number
  languageCode?: string

  // Microphone options (for automatic microphone mode)
  /**
   * Forwarded to `useScribe`. When the prop is omitted, `SpeechInput` passes
   * `{ echoCancellation: true, noiseSuppression: true }`.
   */
  microphone?: {
    deviceId?: string
    echoCancellation?: boolean
    noiseSuppression?: boolean
    autoGainControl?: boolean
    channelCount?: number
  }

  // Manual audio options (forwarded to `useScribe` unchanged)
  audioFormat?: AudioFormat
  sampleRate?: number

  // Error callbacks (forwarded to `useScribe` as the options of the same name)
  onError?: (error: Error | Event) => void
  onAuthError?: (data: { error: string }) => void
  onQuotaExceededError?: (data: { error: string }) => void
}

/**
 * Joins committed and partial transcript text into a single display string.
 *
 * Every committed segment and the partial text are trimmed individually, and
 * pieces that are empty after trimming are skipped. This keeps an empty or
 * whitespace-padded segment from introducing doubled spaces in the result.
 *
 * @param partialTranscript - Text that has not been committed yet.
 * @param committedTranscripts - Committed segments, in order.
 * @returns The non-empty pieces separated by single spaces, or `""` when
 *   there is no text at all.
 */
const buildTranscript = ({
  partialTranscript,
  committedTranscripts,
}: {
  partialTranscript: string
  committedTranscripts: string[]
}): string =>
  [...committedTranscripts, partialTranscript]
    .map((segment) => segment.trim())
    .filter((segment) => segment.length > 0)
    .join(" ")

/**
 * Wraps the given transcript state in a `SpeechInputEvent`.
 *
 * The partial text and the committed array are passed through as-is (the
 * array is not copied); `transcript` is derived with `buildTranscript`.
 */
const buildEvent = (source: {
  partialTranscript: string
  committedTranscripts: string[]
}): SpeechInputEvent => {
  const { partialTranscript, committedTranscripts } = source
  const transcript = buildTranscript({ partialTranscript, committedTranscripts })

  return { partialTranscript, committedTranscripts, transcript }
}

/**
 * Root of the speech-input compound component.
 *
 * Owns the `useScribe` session, tracks the transcripts received through its
 * callbacks, and exposes `start`, `stop` and `cancel` plus the current
 * transcript state to descendants via `SpeechInputContext`. Renders a wrapper
 * `div` (which receives the forwarded ref) around `children`.
 *
 * - `start` resets the transcripts, awaits `getToken`, connects, then calls
 *   `onStart`. A rejected `getToken` is reported through `onError`.
 * - `stop` disconnects and calls `onStop` with the transcript gathered so far.
 * - `cancel` disconnects, clears the transcripts, and calls `onCancel` with
 *   the text captured before clearing.
 */
const SpeechInput = forwardRef<HTMLDivElement, SpeechInputProps>(
  function SpeechInput(
    {
      children,
      getToken,
      onChange,
      onCancel,
      onStart,
      onStop,
      className,
      size = "default",
      modelId = "scribe_v2_realtime",
      baseUri,
      commitStrategy,
      vadSilenceThresholdSecs,
      vadThreshold,
      minSpeechDurationMs,
      minSilenceDurationMs,
      languageCode,
      microphone = {
        echoCancellation: true,
        noiseSuppression: true,
      },
      audioFormat,
      sampleRate,
      onError,
      onAuthError,
      onQuotaExceededError,
    },
    ref
  ) {
    // Transcripts accumulated from the hook callbacks. Kept in a ref so the
    // callbacks and stop/cancel can read the latest values synchronously.
    const transcriptsRef = useRef({
      partialTranscript: "",
      committedTranscripts: [] as string[],
    })
    // Bumped by every start/stop/cancel and on unmount. An in-flight start
    // compares against it after each await to detect that it was superseded.
    const startRequestIdRef = useRef(0)

    const scribe = useScribe({
      modelId,
      baseUri,
      commitStrategy,
      vadSilenceThresholdSecs,
      vadThreshold,
      minSpeechDurationMs,
      minSilenceDurationMs,
      languageCode,
      audioFormat,
      sampleRate,
      microphone,
      onPartialTranscript: (data) => {
        transcriptsRef.current.partialTranscript = data.text
        onChange?.(buildEvent(transcriptsRef.current))
      },
      onCommittedTranscript: (data) => {
        // Append immutably: the previous array has already been handed to
        // consumers inside earlier events and must not change under them.
        transcriptsRef.current = {
          partialTranscript: "",
          committedTranscripts: [
            ...transcriptsRef.current.committedTranscripts,
            data.text,
          ],
        }
        onChange?.(buildEvent(transcriptsRef.current))
      },
      onError,
      onAuthError,
      onQuotaExceededError,
    })

    const isConnecting = scribe.status === "connecting"

    const start = useCallback(async () => {
      const requestId = startRequestIdRef.current + 1
      startRequestIdRef.current = requestId

      transcriptsRef.current = {
        partialTranscript: "",
        committedTranscripts: [],
      }
      scribe.clearTranscripts()

      try {
        let token: string
        try {
          token = await getToken()
        } catch (error) {
          // getToken runs outside useScribe, so the hook never sees this
          // failure. Report it here unless the request was superseded.
          if (startRequestIdRef.current === requestId) {
            onError?.(error instanceof Error ? error : new Error(String(error)))
          }
          return
        }
        if (startRequestIdRef.current !== requestId) {
          return
        }

        await scribe.connect({
          token,
        })
        if (startRequestIdRef.current !== requestId) {
          // stop/cancel/unmount happened while connecting; drop the session.
          scribe.disconnect()
          return
        }
        onStart?.(buildEvent(transcriptsRef.current))
      } catch {
        // Connection errors are expected to surface through the onError
        // callback given to useScribe; start() itself never rejects.
      }
      // eslint-disable-next-line react-hooks/exhaustive-deps
    }, [getToken, scribe, onStart, onError, microphone])

    const stop = () => {
      startRequestIdRef.current += 1
      scribe.disconnect()
      onStop?.(buildEvent(transcriptsRef.current))
    }

    const cancel = () => {
      startRequestIdRef.current += 1
      // Snapshot before clearing so onCancel still receives the discarded text.
      const event = buildEvent(transcriptsRef.current)
      scribe.disconnect()
      scribe.clearTranscripts()
      transcriptsRef.current = {
        partialTranscript: "",
        committedTranscripts: [],
      }
      onCancel?.(event)
    }

    const contextValue: SpeechInputContextValue = {
      isConnected: scribe.isConnected,
      isConnecting,
      start,
      stop,
      cancel,
      error: scribe.error,
      size,
      ...buildEvent({
        partialTranscript: scribe.partialTranscript,
        committedTranscripts: scribe.committedTranscripts.map((t) => t.text),
      }),
    }

    // Invalidate any in-flight start and close the session during cleanup.
    useEffect(() => {
      return () => {
        startRequestIdRef.current += 1
        scribe.disconnect()
      }
    }, [scribe.disconnect])

    return (
      <SpeechInputContext.Provider value={contextValue}>
        <div
          ref={ref}
          className={cn(
            "relative inline-flex items-center overflow-hidden rounded-lg border border-transparent transition-all duration-200",
            scribe.isConnected
              ? "bg-background dark:bg-muted border-input shadow-sm"
              : "",
            className
          )}
        >
          {children}
        </div>
      </SpeechInputContext.Provider>
    )
  }
)

// Record button - toggles between mic icon and stop icon
/** `Button` props minus `size`, which comes from the `SpeechInput` context. */
type SpeechInputRecordButtonProps = Omit<
  ComponentPropsWithoutRef<typeof Button>,
  "size"
>

/**
 * Toggle button for the recording session.
 *
 * Clicking calls `stop()` when connected and `start()` otherwise, then the
 * caller's `onClick`. Unless `disabled` is passed explicitly, the button is
 * disabled while connecting. Three stacked indicators cross-fade: a dot while
 * connecting, a square while connected, and a microphone when idle.
 *
 * Defaults to `type="button"` so it does not submit an enclosing form; pass
 * `type` to override.
 */
const SpeechInputRecordButton = forwardRef<
  HTMLButtonElement,
  SpeechInputRecordButtonProps
>(function SpeechInputRecordButton(
  { className, onClick, variant = "ghost", disabled, ...props },
  ref
) {
  const speechInput = useSpeechInput()

  return (
    <Button
      ref={ref}
      type="button"
      variant={variant}
      onClick={(e) => {
        if (speechInput.isConnected) {
          speechInput.stop()
        } else {
          // start() handles its own failures, so the promise is not awaited.
          void speechInput.start()
        }
        onClick?.(e)
      }}
      disabled={disabled ?? speechInput.isConnecting}
      className={cn(
        buttonVariants({ size: speechInput.size }),
        "relative flex flex-shrink-0 items-center justify-center transition-all",
        speechInput.isConnected && "scale-[80%]",
        className
      )}
      aria-label={
        speechInput.isConnected ? "Stop recording" : "Start recording"
      }
      {...props}
    >
      <div
        className={cn(
          "bg-primary absolute h-4 w-4 rounded-full transition-all duration-200",
          speechInput.isConnecting
            ? "scale-90 opacity-100"
            : "scale-[60%] opacity-0"
        )}
      />
      <SquareIcon
        className={cn(
          "text-destructive absolute h-4 w-4 fill-current transition-all duration-200",
          !speechInput.isConnecting && speechInput.isConnected
            ? "scale-100 opacity-100"
            : "scale-[60%] opacity-0"
        )}
      />
      <MicIcon
        className={cn(
          "absolute h-4 w-4 transition-all duration-200",
          !speechInput.isConnecting && !speechInput.isConnected
            ? "scale-100 opacity-100"
            : "scale-[60%] opacity-0"
        )}
      />
    </Button>
  )
})

// Preview - shows the current transcript with partial
/** `div` props plus the text shown while no transcript is available. */
type SpeechInputPreviewProps = ComponentPropsWithoutRef<"div"> & {
  placeholder?: string
}

/**
 * Single-line preview of the current transcript.
 *
 * Shows `placeholder` (default `"Listening..."`, italic) until the transcript
 * contains non-whitespace text, then the transcript itself with faded edges.
 * While disconnected the element collapses to zero width, is marked `inert`,
 * and is hidden from assistive technology.
 */
const SpeechInputPreview = forwardRef<HTMLDivElement, SpeechInputPreviewProps>(
  function SpeechInputPreview(
    { className, placeholder = "Listening...", ...props },
    ref
  ) {
    const speechInput = useSpeechInput()

    // Derive the text and the styling from one check so they cannot disagree.
    // A whitespace-only partial transcript must not replace the placeholder.
    const hasTranscript = speechInput.transcript.trim().length > 0
    const displayText = hasTranscript ? speechInput.transcript : placeholder
    const showPlaceholder = !hasTranscript

    return (
      <div
        ref={ref}
        // @ts-expect-error inert is not yet in React types
        inert={speechInput.isConnected ? undefined : ""}
        className={cn(
          "relative flex h-8 flex-shrink-0 items-center overflow-hidden text-sm transition-[opacity,transform,width] duration-200 ease-out",
          showPlaceholder
            ? "text-muted-foreground italic"
            : "text-muted-foreground [mask-image:linear-gradient(to_right,transparent,black_16px,black_calc(100%-16px),transparent)]",
          speechInput.isConnected ? "w-28 opacity-100" : "w-0 opacity-0",
          className
        )}
        title={displayText}
        aria-hidden={!speechInput.isConnected}
        {...props}
      >
        <motion.p
          key="text"
          layout="position"
          className="absolute top-0 right-0 bottom-0 flex h-full min-w-full items-center px-0 whitespace-nowrap"
        >
          {displayText}
        </motion.p>
      </div>
    )
  }
)

// Cancel button
/** `Button` props minus `size`, which comes from the `SpeechInput` context. */
type SpeechInputCancelButtonProps = Omit<
  ComponentPropsWithoutRef<typeof Button>,
  "size"
>

/**
 * Button that discards the current recording.
 *
 * Clicking calls `cancel()` from the `SpeechInput` context and then the
 * caller's `onClick`. While disconnected it collapses to zero width, ignores
 * pointer events, and is marked `inert`.
 *
 * Defaults to `type="button"` so it does not submit an enclosing form; pass
 * `type` to override.
 */
const SpeechInputCancelButton = forwardRef<
  HTMLButtonElement,
  SpeechInputCancelButtonProps
>(function SpeechInputCancelButton(
  { className, onClick, variant = "ghost", ...props },
  ref
) {
  const speechInput = useSpeechInput()

  return (
    <Button
      ref={ref}
      type="button"
      variant={variant}
      // @ts-expect-error inert is not yet in React types
      inert={speechInput.isConnected ? undefined : ""}
      onClick={(e) => {
        speechInput.cancel()
        onClick?.(e)
      }}
      className={cn(
        buttonVariants({ size: speechInput.size }),
        "flex-shrink-0 transition-[opacity,transform,width] duration-200 ease-out",
        speechInput.isConnected
          ? "scale-[80%] opacity-100"
          : "pointer-events-none w-0 scale-100 opacity-0",
        className
      )}
      aria-label="Cancel recording"
      {...props}
    >
      <XIcon className="h-3 w-3" />
    </Button>
  )
})

export {
  SpeechInput,
  SpeechInputRecordButton,
  SpeechInputPreview,
  SpeechInputCancelButton,
  useSpeechInput,
}

Installation

npx shadcn@latest add @elevenlabs-ui/speech-input

Usage

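import {
  SpeechInput,
  SpeechInputCancelButton,
  SpeechInputPreview,
  SpeechInputRecordButton,
} from "@/components/ui/speech-input"

// `getToken` is required: an async function that resolves to the token used to connect.
<SpeechInput
  getToken={getToken}
  onChange={(event) => console.log(event.transcript)}
>
  <SpeechInputRecordButton />
  <SpeechInputPreview />
  <SpeechInputCancelButton />
</SpeechInput>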