-
Notifications
You must be signed in to change notification settings - Fork 213
Expand file tree
/
Copy pathrealtime_simple_cli.py
More file actions
167 lines (136 loc) · 6 KB
/
realtime_simple_cli.py
File metadata and controls
167 lines (136 loc) · 6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Copyright 2025 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Command Line Interface to run a simple live audio agent.
This agent uses the Gemini API with live/conversation.py to use a turn-based
Gemini model to interact with the user in a real-time conversation similar to
a Live API setup. This is audio only.
## Setup
To install the dependencies for this script, run:
```
pip install --upgrade pyaudio genai-processors google-genai termcolor \
langchain-google-genai transformers google-cloud-texttospeech
```
Note that you need to activate the Google Text-to-Speech API in your Google
Cloud project (as well as speech to text).
Before running this script, ensure the `GOOGLE_API_KEY` environment variable is
set to the API key you obtained from Google AI Studio, and `GOOGLE_PROJECT_ID`
is set to your Google Cloud project ID.
Important: **Use headphones**. This script uses the system default audio
input and output, which often won't include echo cancellation. So to prevent
the model from interrupting itself it is important to use headphones.
## Run
To run the script:
```shell
python3 ./realtime_simple_cli.py
```
"""
import asyncio
import os
from typing import Sequence
from absl import app
from genai_processors import content_api
from genai_processors import context
from genai_processors import processor
from genai_processors.core import audio_io
from genai_processors.core import rate_limit_audio
from genai_processors.core import realtime
from genai_processors.core import speech_to_text
from genai_processors.core import text
from genai_processors.core import text_to_speech
from genai_processors.examples import models
import pyaudio
# Cloud project ID used by the speech-to-text and text-to-speech processors.
# Read with .get() so that merely importing this module does not raise a
# KeyError when the variable is unset: main() checks the value and reports a
# clear, actionable error instead.
GOOGLE_PROJECT_ID = os.environ.get('GOOGLE_PROJECT_ID', '')

# System instruction for the conversation model: keep responses short, lively
# and engaging, and allow Google Search for extra facts.
INSTRUCTION_PARTS = [
    'You are an agent that interacts with the user in a conversation. Make'
    ' the conversation lively and interesting for the user. You can make jokes,'
    ' explain interesting facts related to what you see and hear, predict what'
    ' could happen, judge some actions or reactions, etc. Respond to the'
    ' user in a few sentences maximum: keep it short and engaging. Avoid'
    ' long monologues. You can use Google search to add extra information to'
    ' the user questions or to come up with interesting news or facts.'
]
@processor.create_filter
def _filter_parts(part: content_api.ProcessorPart) -> bool:
  """Keeps only the parts relevant to the text conversation.

  Drops parts on reserved substreams as well as model-generated audio from
  previous turns. This is required when the LLM takes text only, and is
  recommended (though not required) for audio-in models too, so the
  transcription is used as input instead of re-tokenizing the same audio.

  Args:
    part: the part to filter.

  Returns:
    True if the part should be passed through, False if it is filtered out.
  """
  on_reserved_substream = context.is_reserved_substream(part.substream_name)
  is_model_audio = (
      content_api.is_audio(part.mimetype) and part.role.lower() == 'model'
  )
  return not (on_reserved_substream or is_model_audio)
async def run_conversation() -> None:
  r"""Runs a simple audio-in/audio-out conversation agent.

  Audio is captured from and played back on the local machine's default
  input and output devices.
  """
  pya = pyaudio.PyAudio()

  # Microphone capture followed by speech-to-text. The transcription (not the
  # raw audio) is what ends up in the conversation history in the prompt.
  audio_in = audio_io.PyAudioIn(pya)
  stt = speech_to_text.SpeechToText(
      project_id=GOOGLE_PROJECT_ID,
      with_interim_results=False,
  )

  # Main response model. The filter in front of it removes audio parts so the
  # genai model receives text only.
  llm = _filter_parts + models.turn_based_model(
      system_instruction=INSTRUCTION_PARTS
  )

  # Text-to-speech plus a rate limiter that streams small audio chunks back at
  # playback speed. This is what makes interruption work: when the user
  # speaks, the rate limiter stops and no further audio is played.
  speak = text_to_speech.TextToSpeech(
      project_id=GOOGLE_PROJECT_ID
  ) + rate_limit_audio.RateLimitAudio(
      sample_rate=24000,
      delay_other_parts=True,
  )

  # Plays the audio parts; also handles interruptions so the output stops
  # while the user is speaking.
  speakers = audio_io.PyAudioOut(pya)

  # Full agent:
  # mic -> speech to text -> text conversation -> text to speech -> playback.
  agent = (
      audio_in
      + stt
      + realtime.LiveProcessor(turn_processor=llm + speak)
      + speakers
  )

  prompt = 'USER (ctrl+D to end)> '
  await text.terminal_output(
      agent(text.terminal_input(prompt=prompt)), prompt=prompt
  )
def main(argv: Sequence[str]):
  """Entry point: validates the project ID, then runs the conversation loop.

  Args:
    argv: command-line arguments (unused).

  Raises:
    ValueError: if the GOOGLE_PROJECT_ID environment variable is not set.
  """
  del argv  # Unused.
  if GOOGLE_PROJECT_ID:
    asyncio.run(run_conversation())
  else:
    raise ValueError(
        'Project ID is not set. Define a GOOGLE_PROJECT_ID environment variable'
        ' obtained from your Cloud project.'
    )


if __name__ == '__main__':
  app.run(main)