From dcc22168c1e74e4e9655b994a40b781f01f35415 Mon Sep 17 00:00:00 2001
From: Your Name <sohamprabhu@Mac.fios-router.home>
Date: Thu, 5 Jun 2025 20:52:25 -0400
Subject: [PATCH 01/14] Moved the badges to the right

---
 docs/source/en/model_doc/moshi.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md
index 9302a9461959..357f326bc1f5 100644
--- a/docs/source/en/model_doc/moshi.md
+++ b/docs/source/en/model_doc/moshi.md
@@ -16,10 +16,14 @@ rendered properly in your Markdown viewer.
 
 # Moshi
 
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+            <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+            <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+            <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>
 
 ## Overview

From 60dfcfa9d92e0284d4748a64a39707b4a6856f7f Mon Sep 17 00:00:00 2001
From: Your Name <sohamprabhu@Mac.fios-router.home>
Date: Thu, 5 Jun 2025 20:56:59 -0400
Subject: [PATCH 02/14] small Changes

---
 docs/source/en/model_doc/moshi.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md
index 357f326bc1f5..ed6529526aa8 100644
--- a/docs/source/en/model_doc/moshi.md
+++ b/docs/source/en/model_doc/moshi.md
@@ -14,10 +14,6 @@ rendered properly in your Markdown viewer.
 
 -->
 
-# Moshi
-
-
-
 <div style="float: right;">
     <div class="flex flex-wrap space-x-1">
             <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
@@ -26,6 +22,9 @@ rendered properly in your Markdown viewer.
     </div>
 </div>
 
+# Moshi
+
+
 ## Overview
 
 The Moshi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour.

From 8cb77db760e4cc68778a56ae6843f3d128074319 Mon Sep 17 00:00:00 2001
From: Your Name <sohamprabhu@Mac.fios-router.home>
Date: Thu, 5 Jun 2025 23:20:07 -0400
Subject: [PATCH 03/14] Some Changes to moonshine

---
 docs/source/en/model_doc/moonshine.md | 20 ++++++++++++++------
 docs/source/en/model_doc/moshi.md     |  7 ++++---
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index 2a4599e3d7e0..939f3d5a6984 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -14,15 +14,23 @@ rendered properly in your Markdown viewer.
 
 -->
 
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+          <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+          <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+          <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
 # Moonshine
 
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[Moonshine](https://huggingface.co/papers/2410.15608)
+
+
+
+
+You can find all the Moonshine checkpoints on the [Hub](https://huggingface.co/models?search=moonshine).
 
-## Overview
 
 The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands
 ](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden.
diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md
index ed6529526aa8..357f326bc1f5 100644
--- a/docs/source/en/model_doc/moshi.md
+++ b/docs/source/en/model_doc/moshi.md
@@ -14,6 +14,10 @@ rendered properly in your Markdown viewer.
 
 -->
 
+# Moshi
+
+
+
 <div style="float: right;">
     <div class="flex flex-wrap space-x-1">
             <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
@@ -22,9 +26,6 @@ rendered properly in your Markdown viewer.
     </div>
 </div>
 
-# Moshi
-
-
 ## Overview
 
 The Moshi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour.

From 4b20babbf3386613e6f9506de05079da5547fe0f Mon Sep 17 00:00:00 2001
From: Your Name <sohamprabhu@Sohams-MacBook-Air.local>
Date: Sat, 7 Jun 2025 17:02:02 -0400
Subject: [PATCH 04/14] Added the install to pipeline

---
 docs/source/en/model_doc/moonshine.md | 61 ++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index 939f3d5a6984..e349412fe769 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -24,12 +24,69 @@ rendered properly in your Markdown viewer.
 
 # Moonshine
 
-[Moonshine](https://huggingface.co/papers/2410.15608)
+[Moonshine](https://huggingface.co/papers/2410.15608) is a speech recognition model that is optimized for real-time transcription and voice command. Instead of using traditional absolute position embeddings, Moonshine uses Rotary Position Embedding (RoPE).
+
+Moonshine is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time.
+
+You can find all the Moonshine checkpoints on the [Hub](https://huggingface.co/models?search=moonshine).
+
+> [!TIP]
+> Click on the Moonshine models in the right sidebar for more examples of how to apply Moonshine to different speech recognition tasks.
+
+The example below demonstrates how to generate a transcription based on an audio file with [`Pipeline`] or the [`AutoModel`] class.
+
+
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+# uncomment to install ffmpeg which is needed to decode the audio file
+# !brew install ffmpeg
+
+from transformers import pipeline
+
+asr = pipeline("automatic-speech-recognition", model="UsefulSensors/moonshine-base")
+
+result = asr("path_to_audio_file")
+
+#Prints the transcription from the audio file
+print(result["text"])
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+# uncomment to install rjieba which is needed for the tokenizer
+# !pip install rjieba
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+model = AutoModelForMaskedLM.from_pretrained(
+    "junnyu/roformer_chinese_base", torch_dtype=torch.float16
+)
+tokenizer = AutoTokenizer.from_pretrained("junnyu/roformer_chinese_base")
+
+input_ids = tokenizer("水在零度时会[MASK]", return_tensors="pt").to(model.device)
+outputs = model(**input_ids)
+decoded = tokenizer.batch_decode(outputs.logits.argmax(-1), skip_special_tokens=True)
+print(decoded)
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "水在零度时会[MASK]" | transformers-cli run --task fill-mask --model junnyu/roformer_chinese_base --device 0
+```
+
+</hfoption>
+</hfoptions>
 
 
 
 
-You can find all the Moonshine checkpoints on the [Hub](https://huggingface.co/models?search=moonshine).
 
 
 The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands

From 00875c514f76600f2a65721afad8e22f1d400a4b Mon Sep 17 00:00:00 2001
From: Your Name <sohamprabhu@Sohams-MacBook-Air.local>
Date: Mon, 9 Jun 2025 22:12:49 -0400
Subject: [PATCH 05/14] Updated the Moonshine model card

---
 docs/source/en/model_doc/moonshine.md | 48 +++++++--------------------
 1 file changed, 12 insertions(+), 36 deletions(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index e349412fe769..719fa0c3a186 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -58,56 +58,32 @@ print(result["text"])
 <hfoption id="AutoModel">
 
 ```py
-# uncomment to install rjieba which is needed for the tokenizer
-# !pip install rjieba
+# uncomment to install librosa which is used for audio and music anlaysis. It is used to preprocess the data.
+# !pip install librosa
 import torch
-from transformers import AutoModelForMaskedLM, AutoTokenizer
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
-model = AutoModelForMaskedLM.from_pretrained(
-    "junnyu/roformer_chinese_base", torch_dtype=torch.float16
-)
-tokenizer = AutoTokenizer.from_pretrained("junnyu/roformer_chinese_base")
+processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
+model = AutoModelForSpeechSeq2Seq.from_pretrained("UsefulSensors/moonshine-tiny")
 
-input_ids = tokenizer("水在零度时会[MASK]", return_tensors="pt").to(model.device)
-outputs = model(**input_ids)
-decoded = tokenizer.batch_decode(outputs.logits.argmax(-1), skip_special_tokens=True)
-print(decoded)
-```
+audio_array, sr = librosa.load("pathToFile", sr=16000)
+inputs = processor(audio_array, return_tensors="pt", sampling_rate=16000)
 
-</hfoption>
-<hfoption id="transformers CLI">
+generated_ids = model.generate(**inputs, max_new_tokens=256)
+transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-```bash
-echo -e "水在零度时会[MASK]" | transformers-cli run --task fill-mask --model junnyu/roformer_chinese_base --device 0
+print(f"Transcription: '{transcription}'")
 ```
-
 </hfoption>
 </hfoptions>
 
-
-
-
-
-
-The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands
-](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden.
-
-The abstract from the paper is the following:
-
-*This paper introduces Moonshine, a family of speech recognition models optimized for live transcription and voice command processing. Moonshine is based on an encoder-decoder transformer architecture and employs Rotary Position Embedding (RoPE) instead of traditional absolute position embeddings. The model is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time. When benchmarked against OpenAI's Whisper tiny-en, Moonshine Tiny demonstrates a 5x reduction in compute requirements for transcribing a 10-second speech segment while incurring no increase in word error rates across standard evaluation datasets. These results highlight Moonshine's potential for real-time and resource-constrained applications.*
-
-Tips:
+## Notes
 
 - Moonshine improves upon Whisper's architecture:
   1. It uses SwiGLU activation instead of GELU in the decoder layers
   2. Most importantly, it replaces absolute position embeddings with Rotary Position Embeddings (RoPE). This allows Moonshine to handle audio inputs of any length, unlike Whisper which is restricted to fixed 30-second windows.
 
-This model was contributed by [Eustache Le Bihan (eustlb)](https://huggingface.co/eustlb).
-The original code can be found [here](https://github.com/usefulsensors/moonshine).
-
-## Resources
-
-- [Automatic speech recognition task guide](../tasks/asr)
+- A guide for automatic speech recognition can be found [here](../tasks/asr)
 
 ## MoonshineConfig
 

From 4d1550731ec885e7db6e67a65ae25f848f8367a3 Mon Sep 17 00:00:00 2001
From: SohamPrabhu <62270341+SohamPrabhu@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:40:23 -0400
Subject: [PATCH 06/14] Update docs/source/en/model_doc/moonshine.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/model_doc/moonshine.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index 719fa0c3a186..a84a98f80fac 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 
 # Moonshine
 
-[Moonshine](https://huggingface.co/papers/2410.15608) is a speech recognition model that is optimized for real-time transcription and voice command. Instead of using traditional absolute position embeddings, Moonshine uses Rotary Position Embedding (RoPE).
+[Moonshine](https://huggingface.co/papers/2410.15608) is an encoder-decoder speech recognition model optimized for real-time transcription and recognizing voice command. Instead of using traditional absolute position embeddings, Moonshine uses Rotary Position Embedding (RoPE) to handle speech with varying lengths without using padding. This improves efficiency during inference, making it ideal for resource-constrained devices.
 
 Moonshine is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time.
 

From 3fd67fcb0e7e4ce4a488de06f7875a4abd6ded5b Mon Sep 17 00:00:00 2001
From: SohamPrabhu <62270341+SohamPrabhu@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:41:29 -0400
Subject: [PATCH 07/14] Update docs/source/en/model_doc/moonshine.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/model_doc/moonshine.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index a84a98f80fac..352b9728c396 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -26,7 +26,6 @@ rendered properly in your Markdown viewer.
 
 [Moonshine](https://huggingface.co/papers/2410.15608) is an encoder-decoder speech recognition model optimized for real-time transcription and recognizing voice command. Instead of using traditional absolute position embeddings, Moonshine uses Rotary Position Embedding (RoPE) to handle speech with varying lengths without using padding. This improves efficiency during inference, making it ideal for resource-constrained devices.
 
-Moonshine is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time.
 
 You can find all the Moonshine checkpoints on the [Hub](https://huggingface.co/models?search=moonshine).
 

From 932d0f7655c1c7990a3be5b0376b947e4e9ecce5 Mon Sep 17 00:00:00 2001
From: SohamPrabhu <62270341+SohamPrabhu@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:41:40 -0400
Subject: [PATCH 08/14] Update docs/source/en/model_doc/moonshine.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/model_doc/moonshine.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index 352b9728c396..ac4e5068680f 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
 [Moonshine](https://huggingface.co/papers/2410.15608) is an encoder-decoder speech recognition model optimized for real-time transcription and recognizing voice command. Instead of using traditional absolute position embeddings, Moonshine uses Rotary Position Embedding (RoPE) to handle speech with varying lengths without using padding. This improves efficiency during inference, making it ideal for resource-constrained devices.
 
 
-You can find all the Moonshine checkpoints on the [Hub](https://huggingface.co/models?search=moonshine).
+You can find all the original Moonshine checkpoints under the [Useful Sensors](https://huggingface.co/UsefulSensors) organization.
 
 > [!TIP]
 > Click on the Moonshine models in the right sidebar for more examples of how to apply Moonshine to different speech recognition tasks.

From 1afcc86ae2a972afc3acd03c34c6f72012019bbf Mon Sep 17 00:00:00 2001
From: SohamPrabhu <62270341+SohamPrabhu@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:41:53 -0400
Subject: [PATCH 09/14] Update docs/source/en/model_doc/moonshine.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/model_doc/moonshine.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index ac4e5068680f..a6b0929ebfea 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -32,7 +32,7 @@ You can find all the original Moonshine checkpoints under the [Useful Sensors](h
 > [!TIP]
 > Click on the Moonshine models in the right sidebar for more examples of how to apply Moonshine to different speech recognition tasks.
 
-The example below demonstrates how to generate a transcription based on an audio file with [`Pipeline`] or the [`AutoModel`] class.
+The example below demonstrates how to transcribe speech into text with [`Pipeline`] or the [`AutoModel`] class.
 
 
 

From 117a515d32329de68d5f4f90ff60758ca8326556 Mon Sep 17 00:00:00 2001
From: SohamPrabhu <62270341+SohamPrabhu@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:42:28 -0400
Subject: [PATCH 10/14] Update docs/source/en/model_doc/moonshine.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/model_doc/moonshine.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index a6b0929ebfea..106b89f94a47 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -76,7 +76,6 @@ print(f"Transcription: '{transcription}'")
 </hfoption>
 </hfoptions>
 
-## Notes
 
 - Moonshine improves upon Whisper's architecture:
   1. It uses SwiGLU activation instead of GELU in the decoder layers

From 07d5ca67d75bf9c79da2891a992b482a8cfb4fb3 Mon Sep 17 00:00:00 2001
From: Your Name <sohamprabhu@Sohams-MacBook-Air.local>
Date: Tue, 10 Jun 2025 13:42:40 -0400
Subject: [PATCH 11/14] Updated Documentation According to changes

---
 docs/source/en/model_doc/moonshine.md | 60 ++++++++++++++++-----------
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index 106b89f94a47..fd07dbbbbee5 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -40,38 +40,50 @@ The example below demonstrates how to transcribe speech into text with [`Pipelin
 <hfoption id="Pipeline">
 
 ```py
-# uncomment to install ffmpeg which is needed to decode the audio file
-# !brew install ffmpeg
-
+import torch
 from transformers import pipeline
 
-asr = pipeline("automatic-speech-recognition", model="UsefulSensors/moonshine-base")
-
-result = asr("path_to_audio_file")
-
-#Prints the transcription from the audio file
-print(result["text"])
+pipeline = pipeline(
+    task="automatic-speech-recognition",
+    model="UsefulSensors/moonshine-base",
+    torch_dtype=torch.float16,
+    device=0
+)
+pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
 ```
 
 </hfoption>
 <hfoption id="AutoModel">
 
 ```py
-# uncomment to install librosa which is used for audio and music anlaysis. It is used to preprocess the data.
-# !pip install librosa
+# pip install datasets
 import torch
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-
-processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
-model = AutoModelForSpeechSeq2Seq.from_pretrained("UsefulSensors/moonshine-tiny")
-
-audio_array, sr = librosa.load("pathToFile", sr=16000)
-inputs = processor(audio_array, return_tensors="pt", sampling_rate=16000)
-
-generated_ids = model.generate(**inputs, max_new_tokens=256)
-transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-print(f"Transcription: '{transcription}'")
+from datasets import load_dataset
+from transformers import AutoProcessor, MoonshineForConditionalGeneration
+
+processor = AutoProcessor.from_pretrained(
+    "UsefulSensors/moonshine-base",
+)
+model = MoonshineForConditionalGeneration.from_pretrained(
+    "UsefulSensors/moonshine-base",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa"
+).to("cuda")
+
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", split="validation")
+audio_sample = ds[0]["audio"]
+
+input_features = processor(
+    audio_sample["array"],
+    sampling_rate=audio_sample["sampling_rate"],
+    return_tensors="pt"
+)
+input_features = input_features.to("cuda", dtype=torch.float16)
+
+predicted_ids = model.generate(**input_features, cache_implementation="static")
+transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+transcription[0]
 ```
 </hfoption>
 </hfoptions>
@@ -81,7 +93,7 @@ print(f"Transcription: '{transcription}'")
   1. It uses SwiGLU activation instead of GELU in the decoder layers
   2. Most importantly, it replaces absolute position embeddings with Rotary Position Embeddings (RoPE). This allows Moonshine to handle audio inputs of any length, unlike Whisper which is restricted to fixed 30-second windows.
 
-- A guide for automatic speech recognition can be found [here](../tasks/asr)
+-- A guide for automatic speech recognition can be found [here](../tasks/asr)
 
 ## MoonshineConfig
 

From baf56f2c68881ed832e6dce7b07394eed8f7919c Mon Sep 17 00:00:00 2001
From: Your Name <sohamprabhu@Mac.fios-router.home>
Date: Wed, 11 Jun 2025 19:05:09 -0400
Subject: [PATCH 12/14] Fixed the model with the commits

---
 docs/source/en/model_doc/moonshine.md |  7 ++-----
 docs/source/en/model_doc/moshi.md     | 10 ++++------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index fd07dbbbbee5..82e84bc581b5 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -88,12 +88,9 @@ transcription[0]
 </hfoption>
 </hfoptions>
 
+## Resources
 
-- Moonshine improves upon Whisper's architecture:
-  1. It uses SwiGLU activation instead of GELU in the decoder layers
-  2. Most importantly, it replaces absolute position embeddings with Rotary Position Embeddings (RoPE). This allows Moonshine to handle audio inputs of any length, unlike Whisper which is restricted to fixed 30-second windows.
-
--- A guide for automatic speech recognition can be found [here](../tasks/asr)
+- [Automatic speech recognition task guide](../tasks/asr)
 
 ## MoonshineConfig
 
diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md
index 357f326bc1f5..e70286ebf2e1 100644
--- a/docs/source/en/model_doc/moshi.md
+++ b/docs/source/en/model_doc/moshi.md
@@ -18,12 +18,10 @@ rendered properly in your Markdown viewer.
 
 
 
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-            <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-            <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-            <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>
 
 ## Overview

From 2c094e92febd59e5e9d2ae75f861c4f54eb7a4e7 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Thu, 12 Jun 2025 08:58:31 -0700
Subject: [PATCH 13/14] Update moonshine.md

---
 docs/source/en/model_doc/moonshine.md | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md
index 82e84bc581b5..4cd2eec774d4 100644
--- a/docs/source/en/model_doc/moonshine.md
+++ b/docs/source/en/model_doc/moonshine.md
@@ -26,7 +26,6 @@ rendered properly in your Markdown viewer.
 
 [Moonshine](https://huggingface.co/papers/2410.15608) is an encoder-decoder speech recognition model optimized for real-time transcription and recognizing voice command. Instead of using traditional absolute position embeddings, Moonshine uses Rotary Position Embedding (RoPE) to handle speech with varying lengths without using padding. This improves efficiency during inference, making it ideal for resource-constrained devices.
 
-
 You can find all the original Moonshine checkpoints under the [Useful Sensors](https://huggingface.co/UsefulSensors) organization.
 
 > [!TIP]
@@ -34,8 +33,6 @@ You can find all the original Moonshine checkpoints under the [Useful Sensors](h
 
 The example below demonstrates how to transcribe speech into text with [`Pipeline`] or the [`AutoModel`] class.
 
-
-
 <hfoptions id="usage">
 <hfoption id="Pipeline">
 
@@ -88,10 +85,6 @@ transcription[0]
 </hfoption>
 </hfoptions>
 
-## Resources
-
-- [Automatic speech recognition task guide](../tasks/asr)
-
 ## MoonshineConfig
 
 [[autodoc]] MoonshineConfig

From 2114c3033091bf78fda5ad6cd5c82ad7cdeae0d4 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Thu, 12 Jun 2025 08:59:56 -0700
Subject: [PATCH 14/14] Update moshi.md

---
 docs/source/en/model_doc/moshi.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md
index e70286ebf2e1..9302a9461959 100644
--- a/docs/source/en/model_doc/moshi.md
+++ b/docs/source/en/model_doc/moshi.md
@@ -16,8 +16,6 @@ rendered properly in your Markdown viewer.
 
 # Moshi
 
-
-
 <div class="flex flex-wrap space-x-1">
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">