Skip to content

Commit ac68932

Browse files
authored
MLX delegate part 3 (#17829)
This is part 3 of the MLX delegate, which focuses on adding support for many GenAI models.
1 parent 8373587 commit ac68932

38 files changed

+5631
-210
lines changed

.github/workflows/mlx.yml

Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ on:
1010
- .github/workflows/mlx.yml
1111
- backends/mlx/**
1212
- extension/llm/export/**
13+
- extension/audio/**
14+
- examples/models/parakeet/**
15+
- examples/models/voxtral_realtime/**
1316
workflow_dispatch:
1417

1518
permissions: {}
@@ -104,3 +107,370 @@ jobs:
104107
echo "::error::Too many test failures: $FAILED > $MAX_FAILURES"
105108
exit 1
106109
fi
110+
111+
test-mlx-parakeet:
112+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
113+
with:
114+
job-name: test-mlx-parakeet
115+
runner: macos-14-xlarge
116+
python-version: "3.12"
117+
submodules: recursive
118+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
119+
timeout: 90
120+
script: |
121+
set -eux
122+
123+
echo "::group::Install ExecuTorch"
124+
${CONDA_RUN} python install_executorch.py > /dev/null
125+
echo "::endgroup::"
126+
127+
echo "::group::Install Parakeet requirements"
128+
${CONDA_RUN} pip install -r examples/models/parakeet/install_requirements.txt
129+
echo "::endgroup::"
130+
131+
${CONDA_RUN} pip list
132+
133+
echo "::group::Export Parakeet"
134+
${CONDA_RUN} python -m executorch.examples.models.parakeet.export_parakeet_tdt \
135+
--backend mlx \
136+
--dtype bf16 \
137+
--qlinear_encoder 4w \
138+
--qlinear_encoder_group_size 128 \
139+
--qlinear 4w \
140+
--qlinear_group_size 128 \
141+
--output-dir /tmp/parakeet_mlx
142+
echo "::endgroup::"
143+
144+
echo "::group::Build Parakeet MLX runner"
145+
${CONDA_RUN} make parakeet-mlx
146+
echo "::endgroup::"
147+
148+
echo "::group::Run Parakeet MLX runner"
149+
curl -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
150+
OUTPUT=$(./cmake-out/examples/models/parakeet/parakeet_runner \
151+
--model_path /tmp/parakeet_mlx/model.pte \
152+
--audio_path /tmp/test_audio.wav \
153+
--tokenizer_path /tmp/parakeet_mlx/tokenizer.model 2>&1)
154+
echo "Runner output:"
155+
echo "$OUTPUT"
156+
if echo "$OUTPUT" | grep -iq "Phoebe"; then
157+
echo "Success: 'Phoebe' found in output"
158+
else
159+
echo "Failed: Expected 'Phoebe' not found in output"
160+
exit 1
161+
fi
162+
echo "::endgroup::"
163+
164+
test-mlx-voxtral:
165+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
166+
secrets: inherit
167+
with:
168+
job-name: test-mlx-voxtral
169+
runner: macos-14-xlarge
170+
python-version: "3.12"
171+
submodules: recursive
172+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
173+
secrets-env: EXECUTORCH_HF_TOKEN
174+
timeout: 90
175+
script: |
176+
set -eux
177+
178+
echo "::group::Install ExecuTorch"
179+
${CONDA_RUN} python install_executorch.py > /dev/null
180+
echo "::endgroup::"
181+
182+
echo "::group::Install Voxtral requirements"
183+
${CONDA_RUN} pip install mistral_common librosa soundfile datasets
184+
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
185+
${CONDA_RUN} pip install "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
186+
echo "::endgroup::"
187+
188+
${CONDA_RUN} pip list
189+
190+
echo "::group::Export Voxtral"
191+
${CONDA_RUN} python -m executorch.backends.mlx.examples.voxtral.export_voxtral_hf \
192+
--output-dir /tmp/voxtral_mlx \
193+
--dtype bf16 \
194+
--qlinear 4w
195+
echo "::endgroup::"
196+
197+
echo "::group::Build Voxtral MLX runner"
198+
${CONDA_RUN} make voxtral-mlx
199+
echo "::endgroup::"
200+
201+
echo "::group::Run Voxtral MLX runner"
202+
curl -L https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json -o /tmp/tekken.json
203+
curl -L https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav -o /tmp/test_audio.wav
204+
OUTPUT=$(./cmake-out/examples/models/voxtral/voxtral_runner \
205+
--model_path /tmp/voxtral_mlx/model.pte \
206+
--tokenizer_path /tmp/tekken.json \
207+
--audio_path /tmp/test_audio.wav \
208+
--processor_path /tmp/voxtral_mlx/preprocessor.pte \
209+
--prompt "What is happening in this audio?" \
210+
--temperature 0 2>&1)
211+
echo "Runner output:"
212+
echo "$OUTPUT"
213+
if echo "$OUTPUT" | grep -iq "poem"; then
214+
echo "Success: 'poem' found in output"
215+
else
216+
echo "Failed: Expected 'poem' not found in output"
217+
exit 1
218+
fi
219+
echo "::endgroup::"
220+
221+
test-mlx-voxtral-realtime:
222+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
223+
secrets: inherit
224+
with:
225+
job-name: test-mlx-voxtral-realtime
226+
runner: macos-14-xlarge
227+
python-version: "3.12"
228+
submodules: recursive
229+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
230+
secrets-env: EXECUTORCH_HF_TOKEN
231+
timeout: 90
232+
script: |
233+
set -eux
234+
235+
echo "::group::Install ExecuTorch"
236+
${CONDA_RUN} python install_executorch.py > /dev/null
237+
echo "::endgroup::"
238+
239+
echo "::group::Install Voxtral Realtime requirements"
240+
${CONDA_RUN} pip install safetensors
241+
echo "::endgroup::"
242+
243+
${CONDA_RUN} pip list
244+
245+
echo "::group::Download model"
246+
HF_TOKEN=$SECRET_EXECUTORCH_HF_TOKEN ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602')"
247+
MODEL_PATH=$(HF_TOKEN=$SECRET_EXECUTORCH_HF_TOKEN ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))")
248+
echo "Model path: ${MODEL_PATH}"
249+
echo "::endgroup::"
250+
251+
echo "::group::Export preprocessor"
252+
${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \
253+
--feature_size 128 \
254+
--streaming \
255+
--backend mlx \
256+
--output_file /tmp/voxtral_rt_mlx/preprocessor.pte
257+
echo "::endgroup::"
258+
259+
echo "::group::Export Voxtral Realtime (streaming)"
260+
${CONDA_RUN} python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
261+
--model-path "${MODEL_PATH}" \
262+
--backend mlx \
263+
--streaming \
264+
--output-dir /tmp/voxtral_rt_mlx \
265+
--qlinear-encoder 4w \
266+
--qlinear 4w \
267+
--qembedding 8w \
268+
--qembedding-group-size 128
269+
echo "::endgroup::"
270+
271+
echo "::group::Build Voxtral Realtime MLX runner"
272+
${CONDA_RUN} make voxtral_realtime-mlx
273+
echo "::endgroup::"
274+
275+
echo "::group::Run Voxtral Realtime MLX runner"
276+
curl -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
277+
OUTPUT=$(./cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
278+
--model_path /tmp/voxtral_rt_mlx/model.pte \
279+
--tokenizer_path "${MODEL_PATH}/tekken.json" \
280+
--preprocessor_path /tmp/voxtral_rt_mlx/preprocessor.pte \
281+
--audio_path /tmp/test_audio.wav \
282+
--streaming 2>&1)
283+
echo "Runner output:"
284+
echo "$OUTPUT"
285+
if echo "$OUTPUT" | grep -iq "Phoebe"; then
286+
echo "Success: 'Phoebe' found in output"
287+
else
288+
echo "Failed: Expected 'Phoebe' not found in output"
289+
exit 1
290+
fi
291+
echo "::endgroup::"
292+
293+
test-mlx-whisper:
294+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
295+
secrets: inherit
296+
with:
297+
job-name: test-mlx-whisper
298+
runner: macos-14-xlarge
299+
python-version: "3.12"
300+
submodules: recursive
301+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
302+
secrets-env: EXECUTORCH_HF_TOKEN
303+
timeout: 90
304+
script: |
305+
set -eux
306+
307+
echo "::group::Install ExecuTorch and configure MLX build"
308+
${CONDA_RUN} python install_executorch.py > /dev/null
309+
echo "::endgroup::"
310+
311+
echo "::group::Install Whisper requirements"
312+
${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
313+
${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
314+
${CONDA_RUN} pip install transformers soundfile datasets librosa
315+
echo "::endgroup::"
316+
317+
${CONDA_RUN} pip list
318+
319+
echo "::group::Export Whisper"
320+
${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.export_whisper \
321+
--model-id "openai/whisper-tiny" \
322+
--output-dir /tmp/whisper_mlx \
323+
--dtype bf16 \
324+
--qlinear 4w
325+
echo "::endgroup::"
326+
327+
echo "::group::Run Whisper inference"
328+
OUTPUT=$( ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.run_whisper \
329+
--model-dir /tmp/whisper_mlx \
330+
--use-sample-audio 2>&1)
331+
echo "$OUTPUT"
332+
if echo "$OUTPUT" | grep -iq "Mr. Quilter"; then
333+
echo "Success: 'Mr. Quilter' found in transcription"
334+
else
335+
echo "Failed: Expected 'Mr. Quilter' not found in transcription"
336+
exit 1
337+
fi
338+
echo "::endgroup::"
339+
340+
341+
test-mlx-stories110m:
342+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
343+
with:
344+
job-name: test-mlx-stories110m
345+
runner: macos-14-xlarge
346+
python-version: "3.12"
347+
submodules: recursive
348+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
349+
timeout: 90
350+
script: |
351+
set -eux
352+
353+
echo "::group::Install ExecuTorch"
354+
${CONDA_RUN} python install_executorch.py > /dev/null
355+
echo "::endgroup::"
356+
357+
echo "::group::Install Llama requirements"
358+
${CONDA_RUN} sh examples/models/llama/install_requirements.sh
359+
echo "::endgroup::"
360+
361+
${CONDA_RUN} pip list
362+
363+
echo "::group::Build ExecuTorch with MLX delegate"
364+
${CONDA_RUN} cmake --workflow --preset mlx-release
365+
echo "::endgroup::"
366+
367+
echo "::group::Build Llama runner with MLX"
368+
pushd examples/models/llama
369+
${CONDA_RUN} cmake --workflow --preset llama-release
370+
popd
371+
echo "::endgroup::"
372+
373+
echo "::group::Download stories110M artifacts"
374+
curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" --output stories110M.pt
375+
curl -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model
376+
echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
377+
echo "::endgroup::"
378+
379+
echo "::group::Create tokenizer.bin"
380+
${CONDA_RUN} python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
381+
echo "::endgroup::"
382+
383+
echo "::group::Export stories110M with MLX backend via export_llama_lib"
384+
${CONDA_RUN} python -m extension.llm.export.export_llm \
385+
base.checkpoint=stories110M.pt \
386+
base.params=params.json \
387+
model.use_kv_cache=true \
388+
model.dtype_override=fp32 \
389+
backend.mlx.enabled=true \
390+
quantization.qmode=4w \
391+
quantization.group_size=32 \
392+
export.output_name=/tmp/stories110m_mlx.pte
393+
echo "::endgroup::"
394+
395+
echo "::group::Run inference with C++ llama runner"
396+
./cmake-out/examples/models/llama/llama_main \
397+
--model_path=/tmp/stories110m_mlx.pte \
398+
--tokenizer_path=tokenizer.bin \
399+
--prompt="Once upon a time," \
400+
--temperature=0 \
401+
--seq_len=10
402+
echo "::endgroup::"
403+
404+
test-mlx-llm:
405+
strategy:
406+
fail-fast: false
407+
matrix:
408+
model:
409+
- id: "unsloth/Llama-3.2-1B-Instruct"
410+
name: "llama-1b"
411+
- id: "unsloth/Qwen3-0.6B"
412+
name: "qwen3-0.6b"
413+
- id: "unsloth/gemma-3-1b-it"
414+
name: "gemma3-1b"
415+
use-custom: [false, true]
416+
qconfig: ["4w", "nvfp4"]
417+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
418+
secrets: inherit
419+
with:
420+
job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }}
421+
runner: macos-14-xlarge
422+
python-version: "3.12"
423+
submodules: recursive
424+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
425+
secrets-env: EXECUTORCH_HF_TOKEN
426+
timeout: 90
427+
script: |
428+
set -eux
429+
430+
MODEL_ID="${{ matrix.model.id }}"
431+
MODEL_NAME="${{ matrix.model.name }}"
432+
USE_CUSTOM="${{ matrix.use-custom }}"
433+
QCONFIG="${{ matrix.qconfig }}"
434+
435+
CUSTOM_ARGS=""
436+
if [ "${USE_CUSTOM}" = "true" ]; then
437+
CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
438+
fi
439+
440+
echo "::group::Install ExecuTorch and configure MLX build"
441+
${CONDA_RUN} python install_executorch.py > /dev/null
442+
${CONDA_RUN} cmake --preset mlx-release
443+
echo "::endgroup::"
444+
445+
echo "::group::Install LLM requirements"
446+
${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
447+
${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
448+
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
449+
${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
450+
echo "::endgroup::"
451+
452+
${CONDA_RUN} pip list
453+
454+
echo "::group::Export ${MODEL_NAME}"
455+
${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.export_llm_hf \
456+
--model-id "${MODEL_ID}" \
457+
--output /tmp/${MODEL_NAME}.pte \
458+
--qlinear ${QCONFIG} \
459+
--qembedding ${QCONFIG} \
460+
${CUSTOM_ARGS}
461+
echo "::endgroup::"
462+
463+
echo "::group::Run ${MODEL_NAME} inference"
464+
OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.run_llm_hf \
465+
--pte /tmp/${MODEL_NAME}.pte \
466+
--model-id "${MODEL_ID}" \
467+
--prompt "What is the capital of France?" \
468+
--max-new-tokens 50 2>&1)
469+
echo "$OUTPUT"
470+
if echo "$OUTPUT" | grep -iq "Paris"; then
471+
echo "Success: 'Paris' found in output"
472+
else
473+
echo "Failed: Expected 'Paris' not found in output"
474+
exit 1
475+
fi
476+
echo "::endgroup::"

0 commit comments

Comments (0)