Added DeepSpeech RecognitionService

2024-11-22 01:16:23 +04:00 · 2020-11-20 22:25:38 +01:00 · 2020-11-20 22:25:38 +01:00 · 920f3468b2
commit 920f3468b2
parent ea2d551bed
13 changed files with 592 additions and 10 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,4 +8,6 @@ build
 .project

 # JDT-specific (Eclipse Java Development Tools)     
-.classpath
+.classpath
+
+.vscode
--- a/app/assets.xml
+++ b/app/assets.xml
@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project name="assets">
+  <!--
+    Ant helpers that maintain the control files of the synced assets
+    directory: a plain-text list of asset paths and one checksum file per
+    asset. The targets are imported into the Gradle build of the app module.
+  -->
+
+  <!-- Name of the generated file that lists every asset. -->
+  <property name="assets.list.name" value="assets.lst"/>
+  <!-- Directory (relative to the project base dir) that holds the assets. -->
+  <property name="assets.dir" value="src/main/assets/sync"/>
+  <!-- Hash algorithm; also used as the extension of the checksum files. -->
+  <property name="assets.hash.type" value="md5"/>
+  <!-- Patterns matching the generated control files (checksums + list). -->
+  <property name="assets.ctl.files"
+    value="**/*.${assets.hash.type},${assets.list.name}"/>
+
+  <!-- Every file under assets.dir except the generated control files. -->
+  <fileset id="assets" dir="${assets.dir}" excludes="${assets.ctl.files}"/>
+
+  <!-- Deletes the generated control files, leaving the assets untouched. -->
+  <target name="clean_assets">
+    <delete>
+      <fileset dir="${assets.dir}" includes="${assets.ctl.files}"/>
+    </delete>
+  </target>
+
+  <!--
+    Writes the asset list: one path per line, '/' as directory separator,
+    with the "${basedir}/${assets.dir}/" prefix stripped.
+    NOTE(review): the prefix in the map is spelled with '/', so it presumably
+    only matches on platforms whose native separator is '/'; confirm before
+    building on Windows.
+  -->
+  <target name="list">
+    <pathconvert
+      dirsep="/" pathsep="${line.separator}"
+      refid="assets" property="asset.list">
+      <map from="${basedir}/${assets.dir}/" to=""/>
+    </pathconvert>
+    <echo message="${asset.list}" file="${assets.dir}/${assets.list.name}"/>
+  </target>
+
+  <!-- Runs Ant's checksum task with assets.hash.type over every asset. -->
+  <target name="checksum">
+    <checksum algorithm="${assets.hash.type}">
+      <fileset refid="assets"/>
+    </checksum>
+  </target>
+</project>
--- a/app/build.gradle
+++ b/app/build.gradle
@ -5,6 +5,9 @@ repositories {
    maven {
        url  "https://dl.bintray.com/alphacep/vosk"
    }
+    maven {
+        url "https://jitpack.io"
+    }
 }

 android {
@ -31,4 +34,10 @@ dependencies {
    implementation 'com.alphacep:vosk-android:0.3.15'
    implementation 'androidx.appcompat:appcompat:1.2.0'
    implementation 'com.google.code.gson:gson:2.8.6'
-}
+    implementation 'org.mozilla.deepspeech:libdeepspeech:0.8.2'
+    implementation 'com.github.gkonovalov:android-vad:1.0.0'
+}
+
+ant.importBuild 'assets.xml'
+preBuild.dependsOn(list, checksum)
+clean.dependsOn(clean_assets)
--- a/app/src/main/AndroidManifest.xml
+++ b/app/src/main/AndroidManifest.xml
@ -27,5 +27,23 @@
                android:name="android.speech"
                android:resource="@xml/recognition_service" />
        </service>
+
+        <service
+            android:name=".DeepSpeechRecognitionService"
+            android:icon="@drawable/ic_service_trigger"
+            android:label="@string/deepspeech_recognition_service"
+            android:permission="android.permission.RECORD_AUDIO">
+            <intent-filter>
+
+                <!-- The constant value is defined at RecognitionService.SERVICE_INTERFACE. -->
+                <action android:name="android.speech.RecognitionService" />
+
+                <category android:name="android.intent.category.DEFAULT" />
+            </intent-filter>
+
+            <meta-data
+                android:name="android.speech"
+                android:resource="@xml/recognition_service" />
+        </service>
    </application>
 </manifest>
--- a/app/src/main/assets/sync/assets.lst
+++ b/app/src/main/assets/sync/assets.lst
@ -0,0 +1,26 @@
+deepspeech-catala/kenlm.scorer
+deepspeech-catala/model.tflite
+vosk-catala/README
+vosk-catala/am/final.mdl
+vosk-catala/am/tree
+vosk-catala/conf/mfcc.conf
+vosk-catala/conf/model.conf
+vosk-catala/graph/Gr.fst
+vosk-catala/graph/HCLr.fst
+vosk-catala/graph/disambig_tid.int
+vosk-catala/graph/phones/align_lexicon.int
+vosk-catala/graph/phones/align_lexicon.txt
+vosk-catala/graph/phones/disambig.int
+vosk-catala/graph/phones/disambig.txt
+vosk-catala/graph/phones/optional_silence.csl
+vosk-catala/graph/phones/optional_silence.int
+vosk-catala/graph/phones/optional_silence.txt
+vosk-catala/graph/phones/silence.csl
+vosk-catala/graph/phones/word_boundary.int
+vosk-catala/graph/phones/word_boundary.txt
+vosk-catala/ivector/final.dubm
+vosk-catala/ivector/final.ie
+vosk-catala/ivector/final.mat
+vosk-catala/ivector/global_cmvn.stats
+vosk-catala/ivector/online_cmvn.conf
+vosk-catala/ivector/splice.conf
--- a/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer
+++ b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer
--- a/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer.md5
+++ b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer.md5
@ -0,0 +1 @@
+d562825f02f2ba36cbd0a75a17e84e8d
--- a/app/src/main/assets/sync/deepspeech-catala/model.tflite
+++ b/app/src/main/assets/sync/deepspeech-catala/model.tflite
--- a/app/src/main/assets/sync/deepspeech-catala/model.tflite.md5
+++ b/app/src/main/assets/sync/deepspeech-catala/model.tflite.md5
@ -0,0 +1 @@
+735b1327dc3c00af256af64be33cbed3
--- a/app/src/main/assets/sync/vosk-catala/README.md5
+++ b/app/src/main/assets/sync/vosk-catala/README.md5
@ -0,0 +1 @@
+f49442fa8c9e15bfbb6379c788b3104f
--- a/app/src/main/java/cat/oreilly/localstt/DeepSpeechRecognitionService.java
+++ b/app/src/main/java/cat/oreilly/localstt/DeepSpeechRecognitionService.java
@ -0,0 +1,176 @@
+package cat.oreilly.localstt;
+
+import android.content.Intent;
+import android.os.AsyncTask;
+import android.os.Bundle;
+import android.os.RemoteException;
+import android.speech.RecognitionService;
+import android.util.Log;
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder;
+
+import org.kaldi.Assets;
+import org.kaldi.RecognitionListener;
+import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
+
+import java.io.File;
+import java.util.Map;
+import java.util.ArrayList;
+import java.io.IOException;
+
+/**
+ * {@link RecognitionService} implementation backed by a DeepSpeech model.
+ *
+ * <p>
+ * For every listening request the model files are synced from the application
+ * assets, a {@link DeepSpeechService} is created to record and decode audio,
+ * and the hypotheses it reports through {@link RecognitionListener} are
+ * forwarded to the client {@link RecognitionService.Callback}.
+ */
+public class DeepSpeechRecognitionService extends RecognitionService implements RecognitionListener {
+    private final static String TAG = DeepSpeechRecognitionService.class.getSimpleName();
+    private DeepSpeechModel model;
+    // Written by the setup task on a background thread, read on the main thread.
+    private volatile DeepSpeechService speechService;
+
+    private RecognitionService.Callback mCallback;
+
+    /**
+     * Remembers the client callback and starts the asynchronous recognizer setup.
+     */
+    @Override
+    protected void onStartListening(Intent intent, Callback callback) {
+        mCallback = callback;
+        Log.i(TAG, "onStartListening");
+        runRecognizerSetup(intent);
+    }
+
+    /**
+     * Stops recognition and reports an empty final result to the client.
+     */
+    @Override
+    protected void onCancel(Callback callback) {
+        Log.i(TAG, "onCancel");
+        results(new Bundle(), true);
+    }
+
+    /**
+     * Stops recognition and reports an empty final result to the client.
+     */
+    @Override
+    protected void onStopListening(Callback callback) {
+        Log.i(TAG, "onStopListening");
+        results(new Bundle(), true);
+    }
+
+    /**
+     * Syncs the assets, loads the model and starts listening on a background
+     * thread. On success the client is told that speech may begin; on failure it
+     * receives {@code ERROR_CLIENT}.
+     */
+    private void runRecognizerSetup(final Intent intent) {
+        new AsyncTask<Void, Void, Exception>() {
+            @Override
+            protected Exception doInBackground(Void... params) {
+                try {
+                    // Release the recorder and model of a previous session before
+                    // allocating new ones.
+                    releaseSpeechService();
+
+                    Assets assets = new Assets(DeepSpeechRecognitionService.this);
+                    File assetDir = assets.syncAssets();
+
+                    model = new DeepSpeechModel(assetDir.toString() + "/deepspeech-catala/model.tflite");
+                    model.enableExternalScorer(assetDir.toString() + "/deepspeech-catala/kenlm.scorer");
+
+                    setupRecognizer();
+                } catch (IOException | RuntimeException e) {
+                    // Runtime failures while loading the model are reported to the
+                    // client instead of crashing the background task.
+                    return e;
+                }
+                return null;
+            }
+
+            @Override
+            protected void onPostExecute(Exception result) {
+                if (result != null) {
+                    Log.e(TAG, "Failed to init recognizer " + result, result);
+                    error(android.speech.SpeechRecognizer.ERROR_CLIENT);
+                } else {
+                    readyForSpeech(new Bundle());
+                    beginningOfSpeech();
+
+                }
+            }
+        }.execute();
+    }
+
+    @Override
+    public void onDestroy() {
+        super.onDestroy();
+        releaseSpeechService();
+    }
+
+    /**
+     * Cancels and shuts down the current speech service, if there is one.
+     */
+    private void releaseSpeechService() {
+        DeepSpeechService service = speechService;
+        speechService = null;
+        if (service != null) {
+            service.cancel();
+            service.shutdown();
+        }
+    }
+
+    /**
+     * Creates the speech service for the loaded model and starts listening.
+     *
+     * @throws IOException if the audio recorder cannot be created; the caller
+     *                     turns this into an error for the client.
+     */
+    private void setupRecognizer() throws IOException {
+        Log.i(TAG, "Setting up recognizer");
+        DeepSpeechService service;
+        try {
+            service = new DeepSpeechService(this.model, 16000.0f);
+        } catch (IOException e) {
+            Log.e(TAG, "Failed to create speech service", e);
+            throw e;
+        }
+        service.addListener(this);
+        // Store in the field (not a local) so that cancel, timeout and destroy
+        // can reach the running service.
+        speechService = service;
+        service.startListening();
+    }
+
+    private void readyForSpeech(Bundle bundle) {
+        try {
+            mCallback.readyForSpeech(bundle);
+        } catch (RemoteException e) {
+            Log.w(TAG, "Failed to notify client: readyForSpeech", e);
+        }
+    }
+
+    /**
+     * Forwards a hypothesis bundle to the client.
+     *
+     * @param bundle  results bundle, possibly empty
+     * @param isFinal {@code true} to deliver final results and stop recognition,
+     *                {@code false} to deliver partial results and keep listening
+     */
+    private void results(Bundle bundle, boolean isFinal) {
+        try {
+            if (isFinal) {
+                DeepSpeechService service = speechService;
+                if (service != null) {
+                    service.cancel();
+                }
+                mCallback.results(bundle);
+            } else {
+                mCallback.partialResults(bundle);
+            }
+        } catch (RemoteException e) {
+            Log.w(TAG, "Failed to deliver results to client", e);
+        }
+    }
+
+    /**
+     * Wraps a single hypothesis in the bundle layout expected under
+     * {@code SpeechRecognizer.RESULTS_RECOGNITION}.
+     */
+    private Bundle createResultsBundle(String hypothesis) {
+        ArrayList<String> hypotheses = new ArrayList<>();
+        hypotheses.add(hypothesis);
+        Bundle bundle = new Bundle();
+        bundle.putStringArrayList(android.speech.SpeechRecognizer.RESULTS_RECOGNITION, hypotheses);
+        return bundle;
+    }
+
+    private void beginningOfSpeech() {
+        try {
+            mCallback.beginningOfSpeech();
+        } catch (RemoteException e) {
+            Log.w(TAG, "Failed to notify client: beginningOfSpeech", e);
+        }
+    }
+
+    /**
+     * Stops recognition (when it was started at all) and reports an error code to
+     * the client.
+     */
+    private void error(int errorCode) {
+        // The service is still null when initialisation itself failed.
+        DeepSpeechService service = speechService;
+        if (service != null) {
+            service.cancel();
+        }
+        try {
+            mCallback.error(errorCode);
+        } catch (RemoteException e) {
+            Log.w(TAG, "Failed to deliver error " + errorCode + " to client", e);
+        }
+    }
+
+    @Override
+    public void onResult(String hypothesis) {
+        if (hypothesis != null) {
+            Log.i(TAG, hypothesis);
+            results(createResultsBundle(hypothesis), true);
+        }
+    }
+
+    @Override
+    public void onPartialResult(String hypothesis) {
+        if (hypothesis != null) {
+            Log.i(TAG, hypothesis);
+            results(createResultsBundle(hypothesis), false);
+        }
+    }
+
+    @Override
+    public void onError(Exception e) {
+        // Log with the throwable: the message alone may be null.
+        Log.e(TAG, "Recognition error", e);
+        error(android.speech.SpeechRecognizer.ERROR_CLIENT);
+    }
+
+    /**
+     * Restarts listening after the speech service reported a timeout.
+     */
+    @Override
+    public void onTimeout() {
+        DeepSpeechService service = speechService;
+        if (service != null) {
+            service.cancel();
+            service.startListening();
+        }
+    }
+}
--- a/app/src/main/java/cat/oreilly/localstt/DeepSpeechService.java
+++ b/app/src/main/java/cat/oreilly/localstt/DeepSpeechService.java
@ -0,0 +1,313 @@
+// Copyright 2019 Alpha Cephei Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cat.oreilly.localstt;
+
+import static java.lang.String.format;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder.AudioSource;
+import android.os.Handler;
+import android.os.Looper;
+import android.util.Log;
+
+import org.kaldi.RecognitionListener;
+import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
+import org.mozilla.deepspeech.libdeepspeech.DeepSpeechStreamingState;
+
+import com.konovalov.vad.Vad;
+import com.konovalov.vad.VadConfig;
+
+/**
+ * Service that records audio in a thread, passes it to a DeepSpeech recognizer
+ * and emits recognition results. Recognition events are passed to clients on
+ * the main thread using {@link RecognitionListener}.
+ *
+ * <p>
+ * Every listening session uses its own DeepSpeech stream, which is finished
+ * exactly once by the recognizer thread. The model is released once, by
+ * {@link #shutdown()}.
+ */
+public class DeepSpeechService {
+
+    protected static final String TAG = DeepSpeechService.class.getSimpleName();
+
+    private final DeepSpeechModel model;
+    private final Vad vad;
+
+    private final int sampleRate;
+    private final static float BUFFER_SIZE_SECONDS = 0.4f;
+    private int bufferSize;
+    private final AudioRecord recorder;
+
+    private RecognizerThread recognizerThread;
+
+    // Set by shutdown() so that the recorder and the model are released only once.
+    private boolean released;
+
+    private final Handler mainHandler = new Handler(Looper.getMainLooper());
+
+    private final Collection<RecognitionListener> listeners = new HashSet<RecognitionListener>();
+
+    /**
+     * Creates speech service. Service holds the AudioRecord object, so you need to
+     * call {@link #shutdown()} in order to properly finalize it.
+     *
+     * @param model      loaded DeepSpeech model; it is freed by {@link #shutdown()}
+     * @param sampleRate recording sample rate in Hz (truncated to an integer)
+     * @throws IOException thrown if audio recorder can not be created for some
+     *                     reason.
+     */
+    public DeepSpeechService(DeepSpeechModel model, float sampleRate) throws IOException {
+        this.model = model;
+        this.sampleRate = (int) sampleRate;
+
+        // NOTE(review): the VAD is always configured for 16 kHz, whatever
+        // sampleRate is passed in - confirm callers only ever use 16000.
+        vad = new Vad(VadConfig.newBuilder().setSampleRate(VadConfig.SampleRate.SAMPLE_RATE_16K)
+                .setFrameSize(VadConfig.FrameSize.FRAME_SIZE_480).setMode(VadConfig.Mode.NORMAL).build());
+
+        bufferSize = Math.round(this.sampleRate * BUFFER_SIZE_SECONDS);
+        // The recorder buffer is sized in bytes: two bytes per 16-bit sample.
+        recorder = new AudioRecord(AudioSource.VOICE_RECOGNITION, this.sampleRate, AudioFormat.CHANNEL_IN_MONO,
+                AudioFormat.ENCODING_PCM_16BIT, bufferSize * 2);
+
+        if (recorder.getState() == AudioRecord.STATE_UNINITIALIZED) {
+            recorder.release();
+            throw new IOException("Failed to initialize recorder. Microphone might be already in use.");
+        }
+        Log.i(TAG, "DeepSpeechService initialized");
+    }
+
+    /**
+     * Adds listener.
+     */
+    public void addListener(RecognitionListener listener) {
+        synchronized (listeners) {
+            listeners.add(listener);
+        }
+    }
+
+    /**
+     * Removes listener.
+     */
+    public void removeListener(RecognitionListener listener) {
+        synchronized (listeners) {
+            listeners.remove(listener);
+        }
+    }
+
+    /**
+     * Starts recognition. Does nothing if recognition is active.
+     *
+     * @return true if recognition was actually started
+     */
+    public boolean startListening() {
+        return startRecognizerThread(RecognizerThread.NO_TIMEOUT);
+    }
+
+    /**
+     * Starts recognition. After the specified timeout listening stops and
+     * listeners are notified through {@link RecognitionListener#onTimeout()}. Does
+     * nothing if recognition is active.
+     *
+     * @param timeout timeout in milliseconds to listen.
+     *
+     * @return true if recognition was actually started
+     */
+    public boolean startListening(int timeout) {
+        return startRecognizerThread(timeout);
+    }
+
+    /**
+     * Starts a recognizer thread unless one is already registered or the service
+     * has been shut down.
+     */
+    private boolean startRecognizerThread(int timeout) {
+        if (released || null != recognizerThread)
+            return false;
+
+        recognizerThread = new RecognizerThread(timeout);
+        recognizerThread.start();
+        return true;
+    }
+
+    /**
+     * Interrupts the recognizer thread and waits for it to finish.
+     *
+     * @param discardResult true if the thread must not report its final result
+     * @return true if there was a recognizer thread to stop
+     */
+    private boolean stopRecognizerThread(boolean discardResult) {
+        RecognizerThread thread = recognizerThread;
+        if (null == thread)
+            return false;
+
+        if (discardResult)
+            thread.discardResult();
+
+        try {
+            thread.interrupt();
+            thread.join();
+        } catch (InterruptedException e) {
+            // Restore the interrupted status.
+            Thread.currentThread().interrupt();
+        }
+
+        recognizerThread = null;
+        return true;
+    }
+
+    /**
+     * Stops recognition. All listeners should receive final result if there is any.
+     * Does nothing if recognition is not active.
+     *
+     * @return true if recognition was actually stopped
+     */
+    public boolean stop() {
+        // The recognizer thread finishes the stream and posts the final result
+        // itself; finishing the same stream again here would touch freed state.
+        return stopRecognizerThread(false);
+    }
+
+    /**
+     * Cancels recognition. Listeners do not receive final result. Does nothing if
+     * recognition is not active.
+     *
+     * @return true if recognition was actually canceled
+     */
+    public boolean cancel() {
+        Log.d(TAG, "#cancel");
+        boolean result = stopRecognizerThread(true);
+        // Drop notifications that were queued but not yet delivered. The model
+        // stays loaded so that listening can be started again.
+        mainHandler.removeCallbacksAndMessages(null);
+        return result;
+    }
+
+    /**
+     * Shuts down the recognizer: stops a running session, releases the recorder
+     * and frees the model. Calling it more than once has no further effect.
+     */
+    public void shutdown() {
+        Log.d(TAG, "#shutdown");
+        if (released)
+            return;
+        released = true;
+
+        // The recorder must not be released while the thread is still reading.
+        stopRecognizerThread(true);
+        mainHandler.removeCallbacksAndMessages(null);
+        recorder.release();
+        this.model.freeModel();
+    }
+
+    /**
+     * Thread that records audio, feeds it to a DeepSpeech stream of its own and
+     * reports the outcome of the session on the main thread.
+     */
+    private final class RecognizerThread extends Thread {
+
+        private final static int NO_TIMEOUT = -1;
+
+        private final DeepSpeechStreamingState streamContext;
+        private final int timeoutSamples;
+        private int remainingSamples;
+        // Set from another thread by cancel()/shutdown().
+        private volatile boolean discardResult;
+
+        /**
+         * @param timeout listening timeout in milliseconds, or {@link #NO_TIMEOUT}
+         */
+        public RecognizerThread(int timeout) {
+            if (timeout != NO_TIMEOUT)
+                // long arithmetic: timeout * sampleRate overflows int after ~134 s
+                this.timeoutSamples = (int) Math.min(Integer.MAX_VALUE, (long) timeout * sampleRate / 1000);
+            else
+                this.timeoutSamples = NO_TIMEOUT;
+            this.remainingSamples = this.timeoutSamples;
+            // A finished stream cannot be fed again, so each session gets its own.
+            this.streamContext = model.createStream();
+        }
+
+        /**
+         * Asks the thread not to report anything when it finishes.
+         */
+        void discardResult() {
+            discardResult = true;
+        }
+
+        @Override
+        public void run() {
+            Log.i(TAG, "Start Recording...");
+
+            IOException failure;
+            vad.start();
+            recorder.startRecording();
+            if (recorder.getRecordingState() == AudioRecord.RECORDSTATE_STOPPED) {
+                failure = new IOException("Failed to start recording. Microphone might be already in use.");
+            } else {
+                failure = feedUntilDone();
+            }
+
+            recorder.stop();
+            vad.stop();
+            // finishStream() is called exactly once per session, also on failure.
+            // NOTE(review): it presumably releases the native stream as well as
+            // decoding it - confirm against the DeepSpeech API documentation.
+            String hypothesis = model.finishStream(streamContext);
+
+            if (discardResult)
+                return;
+
+            if (failure != null) {
+                mainHandler.post(new OnErrorEvent(failure));
+                return;
+            }
+
+            mainHandler.post(new ResultEvent(hypothesis, true));
+
+            // If we met timeout signal that speech ended
+            if (timeoutSamples != NO_TIMEOUT && remainingSamples <= 0) {
+                mainHandler.post(new TimeoutEvent());
+            }
+        }
+
+        /**
+         * Feeds recorded audio to the stream until the thread is interrupted, the
+         * timeout elapses, or silence follows detected speech.
+         *
+         * @return the failure that ended the session, or null if it ended normally
+         */
+        private IOException feedUntilDone() {
+            short[] buffer = new short[bufferSize];
+            boolean speechDetected = false;
+
+            while (!interrupted() && ((timeoutSamples == NO_TIMEOUT) || (remainingSamples > 0))) {
+                int nread = recorder.read(buffer, 0, buffer.length);
+                if (nread < 0) {
+                    return new IOException("error reading audio buffer (code " + nread + ")");
+                }
+
+                Log.i(TAG, "Feeding audio");
+                model.feedAudioContent(streamContext, buffer, nread);
+                if (timeoutSamples != NO_TIMEOUT) {
+                    remainingSamples = remainingSamples - nread;
+                }
+
+                // NOTE(review): the whole buffer is passed although the VAD is
+                // configured with a 480-sample frame - confirm how much of the
+                // buffer isSpeech() actually inspects.
+                boolean isSpeech = vad.isSpeech(buffer);
+                if (isSpeech) {
+                    Log.d(TAG, "Speech detected");
+                    speechDetected = true;
+                } else if (speechDetected) {
+                    Log.d(TAG, "Silence detected");
+                    return null;
+                }
+            }
+            return null;
+        }
+    }
+
+    /**
+     * Base class of the notifications posted to the main thread; delivers itself
+     * to a snapshot of the registered listeners.
+     */
+    private abstract class RecognitionEvent implements Runnable {
+        public void run() {
+            RecognitionListener[] snapshot;
+            // Same lock as addListener/removeListener; listeners are invoked
+            // outside of it.
+            synchronized (listeners) {
+                snapshot = listeners.toArray(new RecognitionListener[0]);
+            }
+            for (RecognitionListener listener : snapshot)
+                execute(listener);
+        }
+
+        protected abstract void execute(RecognitionListener listener);
+    }
+
+    /**
+     * Delivers a final or partial hypothesis.
+     */
+    private class ResultEvent extends RecognitionEvent {
+        protected final String hypothesis;
+        private final boolean finalResult;
+
+        ResultEvent(String hypothesis, boolean finalResult) {
+            this.hypothesis = hypothesis;
+            this.finalResult = finalResult;
+        }
+
+        @Override
+        protected void execute(RecognitionListener listener) {
+            if (finalResult)
+                listener.onResult(hypothesis);
+            else
+                listener.onPartialResult(hypothesis);
+        }
+    }
+
+    /**
+     * Delivers a failure of the recognition session.
+     */
+    private class OnErrorEvent extends RecognitionEvent {
+        private final Exception exception;
+
+        OnErrorEvent(Exception exception) {
+            this.exception = exception;
+        }
+
+        @Override
+        protected void execute(RecognitionListener listener) {
+            listener.onError(exception);
+        }
+    }
+
+    /**
+     * Signals that the listening timeout elapsed.
+     */
+    private class TimeoutEvent extends RecognitionEvent {
+        @Override
+        protected void execute(RecognitionListener listener) {
+            listener.onTimeout();
+        }
+    }
+}
--- a/app/src/main/java/cat/oreilly/localstt/VoskRecognitionService.java
+++ b/app/src/main/java/cat/oreilly/localstt/VoskRecognitionService.java
@ -21,7 +21,7 @@ import java.util.ArrayList;
 import java.io.IOException;

 public class VoskRecognitionService extends RecognitionService implements RecognitionListener {
-    private final static String TAG = VoskRecognitionService.class.getName();
+    private final static String TAG = VoskRecognitionService.class.getSimpleName();
    private KaldiRecognizer recognizer;
    private SpeechService speechService;
    private Model model;
@ -38,13 +38,13 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
    @Override
    protected void onCancel(Callback callback) {
        Log.i(TAG, "onCancel");
-        results(new Bundle());
+        results(new Bundle(), true);
    }

    @Override
    protected void onStopListening(Callback callback) {
        Log.i(TAG, "onStopListening");
-        results(new Bundle());
+        results(new Bundle(), true);
    }

    private void runRecognizerSetup(final Intent intent) {
@ -108,10 +108,14 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
        }
    }

-    private void results(Bundle bundle) {
-        speechService.cancel();
+    private void results(Bundle bundle, boolean isFinal) {
        try {
-            mCallback.results(bundle);
+            if (isFinal) {
+                speechService.cancel();
+                mCallback.results(bundle);
+            } else {
+                mCallback.partialResults(bundle);
+            }
        } catch (RemoteException e) {
            // empty
        }
@ -149,7 +153,7 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
            Gson gson = new Gson();
            Map<String, String> map = gson.fromJson(hypothesis, Map.class);
            String text = map.get("text");
-            results(createResultsBundle(text));
+            results(createResultsBundle(text), true);
        }
    }

@ -160,7 +164,7 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
            Gson gson = new Gson();
            Map<String, String> map = gson.fromJson(hypothesis, Map.class);
            String text = map.get("partial");
-            results(createResultsBundle(text));
+            results(createResultsBundle(text), false);
        }
    }