BackgroundAudio/BackgroundAudioSpeech_8h_source.html

/*

    BackgroundAudio

    Plays an audio file using IRQ driven decompression.  Main loop() writes

    data to the buffer but isn't blocked while playing


    Copyright (c) 2024 Earle F. Philhower, III <earlephilhower@yahoo.com>


    This program is free software: you can redistribute it and/or modify

    it under the terms of the GNU General Public License as published by

    the Free Software Foundation, either version 3 of the License, or

    (at your option) any later version.


    This program is distributed in the hope that it will be useful,

    but WITHOUT ANY WARRANTY; without even the implied warranty of

    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

    GNU General Public License for more details.


    You should have received a copy of the GNU General Public License

    along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/


#pragma once

#include <Arduino.h>

#include "WrappedAudioOutputBase.h"

#include "BackgroundAudioGain.h"

#include "BackgroundAudioBuffers.h"

#include "libespeak-ng/espeak-ng/speak_lib.h"

#include "libespeak-ng/phoneme/phonindex.h"

#include "libespeak-ng/phoneme/phontab.h"

#include "libespeak-ng/phoneme/phondata.h"

#include "libespeak-ng/phoneme/intonations.h"


// Dictionary blob and its length for the language in use. They are only
// declared here; the definitions are expected to be supplied by including a
// language (per the original note). begin() passes both to espeak_InstallDict().

extern const unsigned char __espeakng_dict[];

extern size_t __espeakng_dictlen;


// An ESpeak-NG voice blob bundled with its human-readable name.
// Member order is significant: instances are brace-initialized positionally.
struct BackgroundAudioVoice {
    const char *name;           // Human-readable name of the voice
    size_t len;                 // Size of binary data
    const unsigned char *data;  // Pointer to the binary voice data
};


template<class DataBuffer>


class BackgroundAudioSpeechClass {

public:

    BackgroundAudioSpeechClass() {

        _playing = false;

        _out = nullptr;

        _paused = false;

        _voice = nullptr;

    }


    BackgroundAudioSpeechClass(AudioOutputBase &d) {

        _playing = false;

        _paused = false;

        setDevice(&d);

    }


    ~BackgroundAudioSpeechClass() {}


    bool setDevice(AudioOutputBase *d) {

        if (!_playing) {

            _out = d;

            return true;

        }

        return false;

    }


    void setGain(float scale) {

        _gain = (int32_t)(scale * (1 << 16));

    }


    void setVoice(BackgroundAudioVoice &v) {

        _voice = v.data;

        _voiceLen = v.len;

    }


    void setRate(int rate) {

        espeak_SetParameter(espeakRATE, rate, 0);

    }


    void setPitch(int pitch) {

        espeak_SetParameter(espeakPITCH, pitch, 0);

    }


    void setWordGap(int gap) {

        espeak_SetParameter(espeakWORDGAP, gap, 0);

    }


    bool begin() {

        if (_playing || !_voice || !_voiceLen) {

            return false;

        }


        espeak_EnableSingleStep();

        espeak_InstallDict(__espeakng_dict, __espeakng_dictlen);

        espeak_InstallPhonIndex(_phonindex, sizeof(_phonindex));

        espeak_InstallPhonTab(_phontab, sizeof(_phontab));

        espeak_InstallPhonData(_phondata, sizeof(_phondata));

        espeak_InstallIntonations(_intonations, sizeof(_intonations));

        espeak_InstallVoice(_voice, _voiceLen);


        int samplerate = espeak_Initialize(AUDIO_OUTPUT_SYNCH_PLAYBACK, 20, nullptr, 0);

        espeak_SetVoiceByFile("INTERNAL");

        espeak_SetSynthCallback(_speechCB);


        // We will use natural frame size to minimize mismatch

        _out->setBuffers(5, framelen);

        _out->onTransmit(&_cb, (void *)this); // The pump we will use to generate our audio

        _out->setBitsPerSample(16);

        _out->setStereo(true);

        _out->setFrequency(samplerate);

        _out->begin();


        // Stuff with silence to start

        uint16_t zeros[32] __attribute__((aligned(4))) = {};

        while (_out->availableForWrite() > 32) {

            _out->write((uint8_t *)zeros, sizeof(zeros));

        }


        _playing = true;


        return true;

    }


    void end() {

        if (_playing) {

            _out->end();

        }

        _playing = false;

    }


    bool playing() {

        return _playing;

    }


    size_t write(const void *data, size_t len) {

        return _ib.write((const uint8_t *)data, len);

    }


    size_t speak(const char *string) {

        if (availableForWrite() < strlen(string)) {

            return 0; // All or nothing

        }

        return write((const void *)string, strlen(string) + 1);

    }


    size_t speak(const String &string) {

        return speak(string.c_str());

    }


    size_t availableForWrite() {

        return _ib.availableForWrite();

    }


    size_t available() {

        return _ib.available();

    }


    bool done() {

        return !available() && !_generatingSpeech;

    }


    uint32_t frames() {

        return _frames;

    }


    uint32_t shifts() {

        return _shifts;

    }


    uint32_t underflows() {

        return _underflows;

    }


    uint32_t errors() {

        return _errors;

    }


    uint32_t dumps() {

        return _dumps;

    }


    void pause() {

        _paused = true;

    }


    bool paused() {

        return _paused;

    }


    void unpause() {

        _paused = false;

    }


    void flush() {

        noInterrupts();

        _ib.flush();

        _generatingSpeech = false;

        short *mono;

        espeak_SynthesizeOneStep(&mono); // Thrown out

        espeak_AbortSynthesis();

        interrupts();

    }


private:

    static void _cb(void *ptr) {

        ((BackgroundAudioSpeechClass*)ptr)->pump();

    }


    static int _speechCB(short *data, int count, espeak_EVENT *events) {

        return 0; // Should never really be called by ESpeak internals

    }


    void generateOneFrame() {

        _frameLen = 0;


        // If we're not currently synthesizng speech, is there another string we can say?

        if (!_generatingSpeech) {

            if (_ib.available()) {

                const uint8_t *b = _ib.buffer();

                for (int i = 0; i < (int)_ib.available(); i++) {

                    if (!b[i]) {

                        espeak_Synth(_ib.buffer(), i, 0, (espeak_POSITION_TYPE)0, 0, espeakCHARS_AUTO, 0, this);

                        _generatingSpeech = true;

                        break;

                    }

                }

            }

        }


        if (_generatingSpeech && !_frameLen) {

            // Generate the raw samples

            short *mono;

            _frameLen = std::min(espeak_SynthesizeOneStep(&mono), framelen);

            // Now convert to stereo by duplicating channels, store in frame buffer

            int16_t *ptr = _frame;

            for (int i = 0; i < _frameLen; i++) {

                *ptr++ = *mono;

                *ptr++ = *mono++;

            }

            // Amplify if requested

            ApplyGain(_frame, _frameLen * 2, _gain);

            // Advance synthesis state and check if done

            if (!espeak_SynthesisGenerateNext()) {

                _generatingSpeech = false;

                _ib.shiftUp(strlen((const char *)_ib.buffer()) + 1); // Only shift out the speech once it's done speaking, easier to track

                _shifts++;

            }

        }

    }


    void pump() {

        while (_out->availableForWrite() >= (int)framelen) {

            if (!_frameLen && !_paused) {

                generateOneFrame();

            }

            if (_paused || !_frameLen) {

                bzero(_frame, sizeof(_frame));

                _out->write((uint8_t *)_frame, sizeof(_frame));

            } else {

                _frameLen -= _out->write((uint8_t *)_frame, _frameLen * 4) / 4;

            }

        }

    }


private:

    AudioOutputBase *_out;

    bool _playing = false;

    bool _paused = false;

    DataBuffer _ib;

    int32_t _gain = 1 << 16;

    bool _generatingSpeech = false;

    static constexpr int framelen = 1324; // From the 22050 normal samplerate and 20 length

    int16_t _frame[framelen * 2]; // Overprovision in case we get a long speech frame

    int _frameLen = 0;


    const unsigned char *_dict;

    size_t _dictLen;

    const unsigned char *_voice;

    size_t _voiceLen;


    // Quality stats, cumulative

    uint32_t _frames = 0;

    uint32_t _shifts = 0;

    uint32_t _underflows = 0;

    uint32_t _errors = 0;

    uint32_t _dumps = 0;

};


// Ready-to-use speaker type: the speech class instantiated with a RawDataBuffer
// whose size parameter is 1024 (presumably bytes of queued text - confirm
// against RawDataBuffer's definition).
using BackgroundAudioSpeech = BackgroundAudioSpeechClass<RawDataBuffer<1024>>;

BackgroundAudioSpeechClass
Interrupt-driven ESpeak-NG instance. Generates a full frame of samples each cycle and uses the RawBuf...
Definition BackgroundAudioSpeech.h:56

BackgroundAudioSpeechClass::done
bool done()
Determine if no more speech is present in the buffer.
Definition BackgroundAudioSpeech.h:274

BackgroundAudioSpeechClass::setWordGap
void setWordGap(int gap)
Adjust the interword gap after begin()
Definition BackgroundAudioSpeech.h:135

BackgroundAudioSpeechClass::underflows
uint32_t underflows()
Get the number of times the speaker has underflowed waiting on raw data since begin
Definition BackgroundAudioSpeech.h:301

BackgroundAudioSpeechClass::playing
bool playing()
Determines if the speaker has been started.
Definition BackgroundAudioSpeech.h:195

BackgroundAudioSpeechClass::speak
size_t speak(const char *string)
Speaks a C-String.
Definition BackgroundAudioSpeech.h:227

BackgroundAudioSpeechClass::BackgroundAudioSpeechClass
BackgroundAudioSpeechClass(AudioOutputBase &d)
Construct an output device using the specified physical audio output.
Definition BackgroundAudioSpeech.h:70

BackgroundAudioSpeechClass::setDevice
bool setDevice(AudioOutputBase *d)
Set an output device before begin
Definition BackgroundAudioSpeech.h:85

BackgroundAudioSpeechClass::paused
bool paused()
Determine if the playback is paused.
Definition BackgroundAudioSpeech.h:335

BackgroundAudioSpeechClass::shifts
uint32_t shifts()
Get the number of input data shifts processed by decoder since begin
Definition BackgroundAudioSpeech.h:292

BackgroundAudioSpeechClass::setPitch
void setPitch(int pitch)
Adjust the pitch, 0...99, with 50 default. After begin()
Definition BackgroundAudioSpeech.h:126

BackgroundAudioSpeechClass::pause
void pause()
Pause the decoder. Won't process raw input data and will transmit silence.
Definition BackgroundAudioSpeech.h:326

BackgroundAudioSpeechClass::frames
uint32_t frames()
Get number of "frames" processed by speaker.
Definition BackgroundAudioSpeech.h:283

BackgroundAudioSpeechClass::dumps
uint32_t dumps()
Get the number of full buffer dumps (catastrophic data error) since begin
Definition BackgroundAudioSpeech.h:319

BackgroundAudioSpeechClass::availableForWrite
size_t availableForWrite()
Gets number of bytes available to write to raw buffer.
Definition BackgroundAudioSpeech.h:256

BackgroundAudioSpeechClass::unpause
void unpause()
Unpause previously paused playback. Will start processing input data again.
Definition BackgroundAudioSpeech.h:343

BackgroundAudioSpeechClass::begin
bool begin()
Starts the background speaker. Will initialize the output device and start sending silence immediatel...
Definition BackgroundAudioSpeech.h:144

BackgroundAudioSpeechClass::available
size_t available()
Gets number of bytes already in the raw buffer.
Definition BackgroundAudioSpeech.h:265

BackgroundAudioSpeechClass::speak
size_t speak(const String &string)
Speaks an Arduino String.
Definition BackgroundAudioSpeech.h:240

BackgroundAudioSpeechClass::setRate
void setRate(int rate)
Set the speaking rate in ~wpm, after calling begin()
Definition BackgroundAudioSpeech.h:117

BackgroundAudioSpeechClass::end
void end()
Stops the process and then calls the output device's end() to shut it down, too.
Definition BackgroundAudioSpeech.h:183

BackgroundAudioSpeechClass::write
size_t write(const void *data, size_t len)
Writes a block of raw data to the decoder's buffer.
Definition BackgroundAudioSpeech.h:217

BackgroundAudioSpeechClass::setVoice
void setVoice(BackgroundAudioVoice &v)
Sets the voice parameters (language customization)
Definition BackgroundAudioSpeech.h:107

BackgroundAudioSpeechClass::flush
void flush()
Flushes any existing raw data, resets the processor to start a new speaking.
Definition BackgroundAudioSpeech.h:354

BackgroundAudioSpeechClass::setGain
void setGain(float scale)
Set the gain multiplier (volume) for the stream. Takes effect immediately.
Definition BackgroundAudioSpeech.h:98

BackgroundAudioSpeechClass::errors
uint32_t errors()
Get the number of decoder errors since begin
Definition BackgroundAudioSpeech.h:310

BackgroundAudioVoice
Structure to collect an ESpeak-NG voice with its human-readable name.
Definition BackgroundAudioSpeech.h:40

BackgroundAudioVoice::name
const char * name
Definition BackgroundAudioSpeech.h:42

BackgroundAudioVoice::data
const unsigned char * data
Definition BackgroundAudioSpeech.h:46

BackgroundAudioVoice::len
size_t len
Definition BackgroundAudioSpeech.h:44