Frontend Speech-to-Text: Technical Approaches

I recently worked through several pitfalls of doing speech-to-text in the browser. The use case is roughly this:

Say a wake word into the microphone, e.g. "你好小度" ("Hello Xiaodu")
Then say a command, e.g. "打开三维分析" ("open 3D analysis")
The system then takes the recognized text, converts it to pinyin, and performs the matching operation

There are roughly the following options:

Web Speech API

https://developer.mozilla.org/zh-CN/docs/Web/API/Web_Speech_API

This is the browser's built-in API and works out of the box in Chrome. The drawback is that it needs access to the public internet, since Chrome delegates the actual recognition to remote servers. Because our project is deployed on an intranet, this option was ultimately dropped.
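
For reference, a minimal sketch of what using it looks like (Chrome exposes the prefixed webkitSpeechRecognition constructor):

const recognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
recognition.lang = 'zh-CN';          // recognize Mandarin
recognition.continuous = true;       // keep listening across utterances
recognition.interimResults = false;  // only deliver final results
recognition.onresult = (event) => {
  const text = event.results[event.results.length - 1][0].transcript;
  console.log('recognized:', text);
};
recognition.onerror = (event) => {
  // on an intranet this fires with error 'network', since the
  // recognition itself happens on remote servers
  console.error('recognition error:', event.error);
};
recognition.start();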


Vosk

A locally deployable speech recognition model that our backend team found; it can be run via Docker:

docker run -d -p 2700:2700 alphacep/kaldi-en:latest

The frontend communicates with it over WebSocket: it sends the raw audio data (16-bit PCM extracted from the AudioBuffer) and receives the recognized text back. Note that since the wake word and commands here are Chinese, a Chinese model image (e.g. alphacep/kaldi-cn) would presumably be needed rather than the English one above. Here is the Vue component:

<template>
  <div class="sound-con" v-if="recognizing">
    <div class="sound-con">
      <img :src="micUrl" alt="">
      <div class="text" v-if="final_transcript">{{ final_transcript }}</div>
    </div>
  </div>
</template>

<script>
import {pinyin} from 'pinyin-pro';

export default {
  name: "SoundControl",
  data: () => {
    return {
      micUrl: '',
      recognizing: false,
      final_transcript: '', // final recognized text
      pyStr: '',            // pinyin string used for command matching
      wsUrl: 'ws://your-vosk-server-address',
      webSocket: '',
      streamLocal: '',      // microphone stream
      context: "",          // audio processing context
      source: '',           // audio source node
      processor: '',        // audio processing node
      bufferSize: 8192      // buffer size
    }
  },
  mounted: function () {
    this.initWS()
    navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        sampleRate: 8000 // matches the 8 kHz rate the vosk server is configured for here
      }, video: false
    }).then(this.handleStream)
  },
  methods: {
    handleStream(stream) {
      this.streamLocal = stream
      this.context = new AudioContext({sampleRate: 8000});
      this.source = this.context.createMediaStreamSource(stream);
      // Note: createScriptProcessor is deprecated in favor of AudioWorklet,
      // but it still works in current browsers.
      this.processor = this.context.createScriptProcessor(this.bufferSize, 1, 1);

      this.source.connect(this.processor);
      this.processor.connect(this.context.destination);

      this.processor.onaudioprocess = (audioDataChunk) => {
        this.sendAudio(audioDataChunk);
      };
    },
    sendAudio(audioDataChunk) {
      if (this.webSocket.readyState === WebSocket.OPEN) {
        // convert the float samples to a 16-bit PCM payload
        const inputData = audioDataChunk.inputBuffer.getChannelData(0) || new Float32Array(this.bufferSize);
        const targetBuffer = new Int16Array(inputData.length);
        for (let index = 0; index < inputData.length; index++) {
          // clamp to [-1, 1] before scaling into the int16 range
          targetBuffer[index] = 32767 * Math.max(-1, Math.min(1, inputData[index]));
        }
        this.webSocket.send(targetBuffer.buffer);
      }
    },
    initWS() {
      this.webSocket = new WebSocket(this.wsUrl);
      this.webSocket.binaryType = "arraybuffer";

      this.webSocket.onopen = function (event) {
        console.log('vosk speech-to-text connection established');
      };

      this.webSocket.onerror = function (event) {
        console.error(event.data);
      };

      this.webSocket.onmessage = (event) => {
        if (event.data) {
          let parsed = JSON.parse(event.data);
          if (parsed.text) {
            console.log('recognition result: %s', parsed.text)
            // Wake-word detection first: convert the recognized text to pinyin,
            // and if it contains "xiao du", start accepting commands.
            // Fuzzy-sound handling: normalize to front nasals (ing→in, eng→en).
            this.pyStr = pinyin(parsed.text, {toneType: 'none'}).replaceAll(/ing/g, 'in').replaceAll(/eng/g, 'en')
            if (this.pyStr.includes('xiao du')) {
              this.recognizing = true
              // "I'm here! You can try saying: open excavation analysis"
              this.final_transcript = '我在! 您可以尝试说: 打开开挖分析'
              setTimeout(() => {
                this.recognizing = false
                this.final_transcript = ''
              }, 100000)
            } else {
              if (this.recognizing) {
                this.final_transcript = parsed.text
                this.pyStr = pinyin(parsed.text, {toneType: 'none'}).replaceAll(/ing/g, 'in')
                  .replaceAll(/eng/g, 'en').trim()
                  .replaceAll(/\s+/g, ' ')
                this.$store.commit('setSoundControl', this.pyStr)
              }
            }
          }
        }
      };
    }
  },
  watch: {
    recognizing: {
      handler(n) {
        this.micUrl = n ? require('@/assets/img/sound_record/mic-animate.gif')
          : require('@/assets/img/sound_record/mic.gif')
      }
    }
  }
}
</script>

<style lang="scss" scoped>
.sound-con {
  position: fixed;
  left: 500px;
  bottom: 20px;
  cursor: pointer;
  background: rgba(60, 115, 102, 0.8);
  border-radius: 20px;
  height: 50px;

  display: flex;
  align-items: center;

  .text {
    width: auto;
    height: 50px;
    color: white;
    border-radius: 20px;
    background: rgba(60, 115, 102, 0.8);
    margin-left: 20px;
    margin-right: 20px;
    line-height: 50px;
  }
}
</style>
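
On the consuming side, the pinyin string committed via setSoundControl still has to be matched against a command table. A minimal sketch of that dispatch (the command map, the action functions, and the state key are hypothetical, not part of the component above):

// Hypothetical command table: normalized pinyin → action.
// The front-nasal normalization (ing→in, eng→en) matches what the
// component applies before committing to the store.
const commands = {
  'da kai san wei fen xi': () => openAnalysis3D(),  // "打开三维分析"
  'da kai kai wa fen xi':  () => openExcavation(),  // "打开开挖分析"
};

store.watch(
  (state) => state.soundControl, // assumes setSoundControl writes this key
  (pyStr) => {
    for (const [phrase, action] of Object.entries(commands)) {
      if (pyStr.includes(phrase)) {
        action();
        break;
      }
    }
  }
);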

There is also a JS helper class that converts the recorded stream into a Blob, from which a WAV file can be generated:

class Recorder {
  constructor(stream, config) {
    this.config = config || {};
    this.config.sampleBits = 16;    // sample size in bits: 8 or 16
    this.config.sampleRate = 16000; // target sample rate
    this.context = new AudioContext();
    this.audioInput = this.context.createMediaStreamSource(stream);
    this.recorder = this.context.createScriptProcessor(4096, 1, 1);
    this.size = 0    // total number of recorded samples
    this.buffer = [] // recording buffer
    this.inputSampleRate = this.context.sampleRate // input sample rate
    this.inputSampleBits = 16                      // input sample size: 8 or 16
    this.outputSampleRate = this.config.sampleRate // output sample rate
    this.outputSampleBits = this.config.sampleBits // output sample size: 8 or 16

    // audio capture
    this.recorder.onaudioprocess = (e) => {
      this.input(e.inputBuffer.getChannelData(0));
    }
  }

  input(data) {
    this.buffer.push(new Float32Array(data));
    this.size += data.length;
  }

  // merge the buffered chunks and downsample to the output rate
  compress() {
    // merge
    let data = new Float32Array(this.size);
    let offset = 0;
    for (let i = 0; i < this.buffer.length; i++) {
      data.set(this.buffer[i], offset);
      offset += this.buffer[i].length;
    }
    // downsample by keeping every n-th sample; assumes the input rate
    // is an integer multiple of the output rate
    let compression = Math.floor(this.inputSampleRate / this.outputSampleRate);
    let length = Math.floor(data.length / compression);
    let result = new Float32Array(length);
    let index = 0, j = 0;
    while (index < length) {
      result[index] = data[j];
      j += compression;
      index++;
    }
    return result;
  }

  encodeWAV() {
    // assumes the effective rate after compress() equals the output rate
    let sampleRate = Math.min(this.inputSampleRate, this.outputSampleRate);
    let sampleBits = Math.min(this.inputSampleBits, this.outputSampleBits);
    let bytes = this.compress();
    let dataLength = bytes.length * (sampleBits / 8);
    let buffer = new ArrayBuffer(44 + dataLength);
    let data = new DataView(buffer);

    let channelCount = 1; // mono
    let offset = 0;

    let writeString = function (str) {
      for (let i = 0; i < str.length; i++) {
        data.setUint8(offset + i, str.charCodeAt(i));
      }
    }

    // RIFF chunk identifier
    writeString('RIFF');
    offset += 4;
    // byte count from the next address to the end of file, i.e. file size - 8
    data.setUint32(offset, 36 + dataLength, true);
    offset += 4;
    // WAV file marker
    writeString('WAVE');
    offset += 4;
    // format chunk marker
    writeString('fmt ');
    offset += 4;
    // format chunk length, always 0x10 = 16
    data.setUint32(offset, 16, true);
    offset += 4;
    // audio format (1 = PCM)
    data.setUint16(offset, 1, true);
    offset += 2;
    // number of channels
    data.setUint16(offset, channelCount, true);
    offset += 2;
    // sample rate: samples per second per channel
    data.setUint32(offset, sampleRate, true);
    offset += 4;
    // byte rate: channels × sample rate × bits per sample / 8
    data.setUint32(offset, channelCount * sampleRate * (sampleBits / 8), true);
    offset += 4;
    // block align: bytes per sample frame, channels × bits per sample / 8
    data.setUint16(offset, channelCount * (sampleBits / 8), true);
    offset += 2;
    // bits per sample
    data.setUint16(offset, sampleBits, true);
    offset += 2;
    // data chunk identifier
    writeString('data');
    offset += 4;
    // data chunk length, i.e. total size - 44
    data.setUint32(offset, dataLength, true);
    offset += 4;
    // write the sample data
    if (sampleBits === 8) {
      // 8-bit WAV samples are unsigned, with silence at 128
      for (let i = 0; i < bytes.length; i++, offset++) {
        let s = Math.max(-1, Math.min(1, bytes[i]));
        let val = s < 0 ? s * 0x8000 : s * 0x7FFF;
        val = Math.floor(255 / (65535 / (val + 32768)));
        data.setUint8(offset, val);
      }
    } else {
      for (let i = 0; i < bytes.length; i++, offset += 2) {
        let s = Math.max(-1, Math.min(1, bytes[i]));
        data.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
      }
    }

    return new Blob([data], {type: 'audio/wav'});
  }

  start() {
    this.audioInput.connect(this.recorder);
    this.recorder.connect(this.context.destination);
  }

  stop() {
    this.recorder.disconnect();
  }

  getBlob() {
    this.stop();
    const data = this.encodeWAV()
    this.size = 0 // reset recording length
    this.buffer = []
    return data
  }

  play(audio) {
    audio.src = window.URL.createObjectURL(this.getBlob());
  }
}

export default Recorder
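
Typical usage: grab a microphone stream, record, then save the Blob as a .wav file (the anchor-click download below is just one way to persist it):

// Record five seconds from the microphone and download it as recording.wav
navigator.mediaDevices.getUserMedia({audio: true, video: false})
  .then((stream) => {
    const recorder = new Recorder(stream);
    recorder.start();
    setTimeout(() => {
      const blob = recorder.getBlob(); // getBlob() also stops the recorder
      const a = document.createElement('a');
      a.href = window.URL.createObjectURL(blob);
      a.download = 'recording.wav';
      a.click();
    }, 5000);
  });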

Whisper ASR Webservice

I found this one on GitHub; it is built on OpenAI's Whisper.
It can also be deployed via Docker; see the GitHub repos for details:

https://github.com/ahmetoner/whisper-asr-webservice/
https://github.com/openai/whisper/

After deployment, sending a test WAV file does return recognition results.
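
Calling it from the frontend looks roughly like this (a sketch assuming the service's POST /asr endpoint with a multipart audio_file field on its default port 9000; check the repo for the exact parameters):

// Send a WAV Blob (e.g. from Recorder.getBlob()) to the Whisper service
async function transcribe(wavBlob) {
  const form = new FormData();
  form.append('audio_file', wavBlob, 'speech.wav');
  const resp = await fetch('http://localhost:9000/asr?task=transcribe&language=zh&output=json', {
    method: 'POST',
    body: form,
  });
  const result = await resp.json();
  return result.text; // the recognized text
}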

