Frontend Speech-to-Text: Technical Approaches

I recently worked through several pitfalls of doing speech-to-text in the browser. The use case is roughly this:

Say a wake word into the microphone, e.g. "你好小度" ("Hello Xiaodu")
Then say a command, e.g. "打开三维分析" ("open 3D analysis")
The system then takes the recognized text, converts it to pinyin, and performs the matching operation

There are roughly the following options:

Web Speech API

https://developer.mozilla.org/zh-CN/docs/Web/API/Web_Speech_API

This is the browser's built-in API and works out of the box in Chrome. The drawback is that it needs access to the public internet, since Chrome delegates the actual recognition to remote servers. Because our project is deployed on an intranet, this option was ultimately dropped.
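
For reference, a minimal sketch of what using it looks like (Chrome exposes the prefixed webkitSpeechRecognition constructor):

const recognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
recognition.lang = 'zh-CN';          // recognize Mandarin
recognition.continuous = true;       // keep listening across utterances
recognition.interimResults = false;  // only deliver final results
recognition.onresult = (event) => {
  const text = event.results[event.results.length - 1][0].transcript;
  console.log('recognized:', text);
};
recognition.onerror = (event) => {
  // on an intranet this fires with error 'network', since the
  // recognition itself happens on remote servers
  console.error('recognition error:', event.error);
};
recognition.start();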


Vosk

A locally deployable speech recognition model that our backend team found; it can be run via Docker:

docker run -d -p 2700:2700 alphacep/kaldi-en:latest

The frontend communicates with it over WebSocket: it sends the raw audio data (16-bit PCM extracted from the AudioBuffer) and receives the recognized text back. Note that since the wake word and commands here are Chinese, a Chinese model image (e.g. alphacep/kaldi-cn) would presumably be needed rather than the English one above. Here is the Vue component:

<template>
  <div class="sound-con" v-if="recognizing">
    <div class="sound-con">
      <img :src="micUrl" alt="">
      <div class="text" v-if="final_transcript">{{ final_transcript }}</div>
    </div>
  </div>
</template>

<script>
import {pinyin} from 'pinyin-pro';

export default {
  name: "SoundControl",
  data: () => {
    return {
      micUrl: '',
      recognizing: false,
      final_transcript: '', // final recognized text
      pyStr: '',            // pinyin string used for command matching
      wsUrl: 'ws://your-vosk-server-address',
      webSocket: '',
      streamLocal: '',      // microphone stream
      context: "",          // audio processing context
      source: '',           // audio source node
      processor: '',        // audio processing node
      bufferSize: 8192      // buffer size
    }
  },
  mounted: function () {
    this.initWS()
    navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        sampleRate: 8000 // matches the 8 kHz rate the vosk server is configured for here
      }, video: false
    }).then(this.handleStream)
  },
  methods: {
    handleStream(stream) {
      this.streamLocal = stream
      this.context = new AudioContext({sampleRate: 8000});
      this.source = this.context.createMediaStreamSource(stream);
      // Note: createScriptProcessor is deprecated in favor of AudioWorklet,
      // but it still works in current browsers.
      this.processor = this.context.createScriptProcessor(this.bufferSize, 1, 1);

      this.source.connect(this.processor);
      this.processor.connect(this.context.destination);

      this.processor.onaudioprocess = (audioDataChunk) => {
        this.sendAudio(audioDataChunk);
      };
    },
    sendAudio(audioDataChunk) {
      if (this.webSocket.readyState === WebSocket.OPEN) {
        // convert the float samples to a 16-bit PCM payload
        const inputData = audioDataChunk.inputBuffer.getChannelData(0) || new Float32Array(this.bufferSize);
        const targetBuffer = new Int16Array(inputData.length);
        for (let index = 0; index < inputData.length; index++) {
          // clamp to [-1, 1] before scaling into the int16 range
          targetBuffer[index] = 32767 * Math.max(-1, Math.min(1, inputData[index]));
        }
        this.webSocket.send(targetBuffer.buffer);
      }
    },
    initWS() {
      this.webSocket = new WebSocket(this.wsUrl);
      this.webSocket.binaryType = "arraybuffer";

      this.webSocket.onopen = function (event) {
        console.log('vosk speech-to-text connection established');
      };

      this.webSocket.onerror = function (event) {
        console.error(event.data);
      };

      this.webSocket.onmessage = (event) => {
        if (event.data) {
          let parsed = JSON.parse(event.data);
          if (parsed.text) {
            console.log('recognition result: %s', parsed.text)
            // Wake-word detection first: convert the recognized text to pinyin,
            // and if it contains "xiao du", start accepting commands.
            // Fuzzy-sound handling: normalize to front nasals (ing→in, eng→en).
            this.pyStr = pinyin(parsed.text, {toneType: 'none'}).replaceAll(/ing/g, 'in').replaceAll(/eng/g, 'en')
            if (this.pyStr.includes('xiao du')) {
              this.recognizing = true
              // "I'm here! You can try saying: open excavation analysis"
              this.final_transcript = '我在! 您可以尝试说: 打开开挖分析'
              setTimeout(() => {
                this.recognizing = false
                this.final_transcript = ''
              }, 100000)
            } else {
              if (this.recognizing) {
                this.final_transcript = parsed.text
                this.pyStr = pinyin(parsed.text, {toneType: 'none'}).replaceAll(/ing/g, 'in')
                  .replaceAll(/eng/g, 'en').trim()
                  .replaceAll(/\s+/g, ' ')
                this.$store.commit('setSoundControl', this.pyStr)
              }
            }
          }
        }
      };
    }
  },
  watch: {
    recognizing: {
      handler(n) {
        this.micUrl = n ? require('@/assets/img/sound_record/mic-animate.gif')
          : require('@/assets/img/sound_record/mic.gif')
      }
    }
  }
}
</script>

<style lang="scss" scoped>
.sound-con {
  position: fixed;
  left: 500px;
  bottom: 20px;
  cursor: pointer;
  background: rgba(60, 115, 102, 0.8);
  border-radius: 20px;
  height: 50px;

  display: flex;
  align-items: center;

  .text {
    width: auto;
    height: 50px;
    color: white;
    border-radius: 20px;
    background: rgba(60, 115, 102, 0.8);
    margin-left: 20px;
    margin-right: 20px;
    line-height: 50px;
  }
}
</style>
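
On the consuming side, the pinyin string committed via setSoundControl still has to be matched against a command table. A minimal sketch of that dispatch (the command map, the action functions, and the state key are hypothetical, not part of the component above):

// Hypothetical command table: normalized pinyin → action.
// The front-nasal normalization (ing→in, eng→en) matches what the
// component applies before committing to the store.
const commands = {
  'da kai san wei fen xi': () => openAnalysis3D(),  // "打开三维分析"
  'da kai kai wa fen xi':  () => openExcavation(),  // "打开开挖分析"
};

store.watch(
  (state) => state.soundControl, // assumes setSoundControl writes this key
  (pyStr) => {
    for (const [phrase, action] of Object.entries(commands)) {
      if (pyStr.includes(phrase)) {
        action();
        break;
      }
    }
  }
);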

There is also a JS helper class that converts the recorded stream into a Blob, from which a WAV file can be generated:

class Recorder {
  constructor(stream, config) {
    this.config = config || {};
    this.config.sampleBits = 16;    // sample size in bits: 8 or 16
    this.config.sampleRate = 16000; // target sample rate
    this.context = new AudioContext();
    this.audioInput = this.context.createMediaStreamSource(stream);
    this.recorder = this.context.createScriptProcessor(4096, 1, 1);
    this.size = 0    // total number of recorded samples
    this.buffer = [] // recording buffer
    this.inputSampleRate = this.context.sampleRate // input sample rate
    this.inputSampleBits = 16                      // input sample size: 8 or 16
    this.outputSampleRate = this.config.sampleRate // output sample rate
    this.outputSampleBits = this.config.sampleBits // output sample size: 8 or 16

    // audio capture
    this.recorder.onaudioprocess = (e) => {
      this.input(e.inputBuffer.getChannelData(0));
    }
  }

  input(data) {
    this.buffer.push(new Float32Array(data));
    this.size += data.length;
  }

  // merge the buffered chunks and downsample to the output rate
  compress() {
    // merge
    let data = new Float32Array(this.size);
    let offset = 0;
    for (let i = 0; i < this.buffer.length; i++) {
      data.set(this.buffer[i], offset);
      offset += this.buffer[i].length;
    }
    // downsample by keeping every n-th sample; assumes the input rate
    // is an integer multiple of the output rate
    let compression = Math.floor(this.inputSampleRate / this.outputSampleRate);
    let length = Math.floor(data.length / compression);
    let result = new Float32Array(length);
    let index = 0, j = 0;
    while (index < length) {
      result[index] = data[j];
      j += compression;
      index++;
    }
    return result;
  }

  encodeWAV() {
    // assumes the effective rate after compress() equals the output rate
    let sampleRate = Math.min(this.inputSampleRate, this.outputSampleRate);
    let sampleBits = Math.min(this.inputSampleBits, this.outputSampleBits);
    let bytes = this.compress();
    let dataLength = bytes.length * (sampleBits / 8);
    let buffer = new ArrayBuffer(44 + dataLength);
    let data = new DataView(buffer);

    let channelCount = 1; // mono
    let offset = 0;

    let writeString = function (str) {
      for (let i = 0; i < str.length; i++) {
        data.setUint8(offset + i, str.charCodeAt(i));
      }
    }

    // RIFF chunk identifier
    writeString('RIFF');
    offset += 4;
    // byte count from the next address to the end of file, i.e. file size - 8
    data.setUint32(offset, 36 + dataLength, true);
    offset += 4;
    // WAV file marker
    writeString('WAVE');
    offset += 4;
    // format chunk marker
    writeString('fmt ');
    offset += 4;
    // format chunk length, always 0x10 = 16
    data.setUint32(offset, 16, true);
    offset += 4;
    // audio format (1 = PCM)
    data.setUint16(offset, 1, true);
    offset += 2;
    // number of channels
    data.setUint16(offset, channelCount, true);
    offset += 2;
    // sample rate: samples per second per channel
    data.setUint32(offset, sampleRate, true);
    offset += 4;
    // byte rate: channels × sample rate × bits per sample / 8
    data.setUint32(offset, channelCount * sampleRate * (sampleBits / 8), true);
    offset += 4;
    // block align: bytes per sample frame, channels × bits per sample / 8
    data.setUint16(offset, channelCount * (sampleBits / 8), true);
    offset += 2;
    // bits per sample
    data.setUint16(offset, sampleBits, true);
    offset += 2;
    // data chunk identifier
    writeString('data');
    offset += 4;
    // data chunk length, i.e. total size - 44
    data.setUint32(offset, dataLength, true);
    offset += 4;
    // write the sample data
    if (sampleBits === 8) {
      // 8-bit WAV samples are unsigned, with silence at 128
      for (let i = 0; i < bytes.length; i++, offset++) {
        let s = Math.max(-1, Math.min(1, bytes[i]));
        let val = s < 0 ? s * 0x8000 : s * 0x7FFF;
        val = Math.floor(255 / (65535 / (val + 32768)));
        data.setUint8(offset, val);
      }
    } else {
      for (let i = 0; i < bytes.length; i++, offset += 2) {
        let s = Math.max(-1, Math.min(1, bytes[i]));
        data.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
      }
    }

    return new Blob([data], {type: 'audio/wav'});
  }

  start() {
    this.audioInput.connect(this.recorder);
    this.recorder.connect(this.context.destination);
  }

  stop() {
    this.recorder.disconnect();
  }

  getBlob() {
    this.stop();
    const data = this.encodeWAV()
    this.size = 0 // reset recording length
    this.buffer = []
    return data
  }

  play(audio) {
    audio.src = window.URL.createObjectURL(this.getBlob());
  }
}

export default Recorder
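
Typical usage: grab a microphone stream, record, then save the Blob as a .wav file (the anchor-click download below is just one way to persist it):

// Record five seconds from the microphone and download it as recording.wav
navigator.mediaDevices.getUserMedia({audio: true, video: false})
  .then((stream) => {
    const recorder = new Recorder(stream);
    recorder.start();
    setTimeout(() => {
      const blob = recorder.getBlob(); // getBlob() also stops the recorder
      const a = document.createElement('a');
      a.href = window.URL.createObjectURL(blob);
      a.download = 'recording.wav';
      a.click();
    }, 5000);
  });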

Whisper ASR Webservice

I found this one on GitHub; it is built on OpenAI's Whisper.
It can also be deployed via Docker; see the GitHub repos for details:

https://github.com/ahmetoner/whisper-asr-webservice/
https://github.com/openai/whisper/

After deployment, sending a test WAV file does return recognition results.
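
Calling it from the frontend looks roughly like this (a sketch assuming the service's POST /asr endpoint with a multipart audio_file field on its default port 9000; check the repo for the exact parameters):

// Send a WAV Blob (e.g. from Recorder.getBlob()) to the Whisper service
async function transcribe(wavBlob) {
  const form = new FormData();
  form.append('audio_file', wavBlob, 'speech.wav');
  const resp = await fetch('http://localhost:9000/asr?task=transcribe&language=zh&output=json', {
    method: 'POST',
    body: form,
  });
  const result = await resp.json();
  return result.text; // the recognized text
}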

