|
|
马上注册,结交更多好友,享用更多功能,让你轻松玩转社区。
您需要 登录 才可以下载或查看,没有账号?注册
×
作者:微信文章
录音文件准备先将录音文件下载保存好,并转换为wav格式。可以使用ffmpeg:ffmpeg -i input.mp3 output.wav通话录音转文字使用openai开源的whisper可以很方便的将语音转换为文字。我使用的是large-v3-turbo模型,效果和速度都可以。上代码:model = whisper.load_model("large-v3-turbo")text_result = model.transcribe(audio_path, language="Chinese")print(f"Transcription completed. {text_result}")转换结果格式如:{ "segments": [ { "start": 0.0, "end": 1.44, "text": "你好" }, { "start": 1.44, "end": 3.56, "text": "我是那个充电桩的" }, { "start": 3.56, "end": 5.68, "text": "我看你在网上咨询充电桩是吧" }, { "start": 5.68, "end": 6.58, "text": "对对对" }, { "start": 6.58, "end": 8.4, "text": "您的场地大概什么情况" }, { "start": 8.4, "end": 8.96, "text": "你给我说一下" }, { "start": 8.96, "end": 10.92, "text": "我给你匹配一下合适的产品" }, { "start": 10.92, "end": 12.32, "text": "不是场地" }, { "start": 12.32, "end": 15.0, "text": "我们是装在那个设备上面" }, { "start": 15.0, "end": 16.88, "text": "什么设备" }, { "start": 16.88, "end": 19.16, "text": "光伏是吧" }, { "start": 19.16, "end": 20.2, "text": "光伏板" } ], "language": "Chinese"}说话人分离使用pyannote/speaker-diarization-3.1可以将说话人分离出来,先要获取一个huggingface的access token,根据huggingface指引配置即可。代码如下:pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token="xxx")pipeline.to(torch.device("cpu"))diarization = pipeline(audio_path, min_speakers=2, max_speakers=2)speaker_segments = []for turn, _, speaker in diarization.itertracks(yield_label=True): speaker_segments.append((turn.start, turn.end, speaker))speaker_segments.sort(key=lambda x: x[0])结果如下:
[ [ 0.92534375, 1.66784375, "SPEAKER_01" ], [ 1.22909375, 1.66784375, "SPEAKER_00" ], [ 2.14034375, 5.228468750000001, "SPEAKER_00" ], [ 2.61284375, 2.79846875, "SPEAKER_01" ], [ 5.71784375, 6.51096875, "SPEAKER_01" ], [ 6.61221875, 7.01721875, "SPEAKER_00" ], [ 7.16909375, 10.30784375, "SPEAKER_00" ], [ 10.965968750000002, 11.657843750000001, "SPEAKER_01" ], [ 11.860343750000002, 15.04971875, "SPEAKER_01" ], [ 16.28159375, 16.95659375, "SPEAKER_00" ], [ 16.29846875, 17.024093750000002, "SPEAKER_01" ], [ 17.73284375, 17.74971875, "SPEAKER_01" ], [ 17.74971875, 17.766593750000002, "SPEAKER_00" ], [ 17.766593750000002, 18.64409375, "SPEAKER_01" ], [ 17.78346875, 18.86346875, "SPEAKER_00" ], [ 19.184093750000002, 21.07409375, "SPEAKER_01" ]]将文字和说话人关联分两种情况:1、文字和说话人时间重叠,取重叠时间最长的那个说话人即可;2、文字和说话人无重叠,找距离最近的说话人。def calculate_overlap_duration(self,text_start, text_end, speaker_start, speaker_end): """计算两个时间段的重叠时长""" overlap_start = max(text_start, speaker_start) overlap_end = min(text_end, speaker_end) overlap_duration = max(0, overlap_end - overlap_start) return overlap_duration
def find_best_speaker_for_text_by_over(self, text_start, text_end, all_segments):
    """Find the diarization speaker whose turn overlaps this text segment the most.

    Args:
        text_start: Start time (seconds) of the transcribed text segment.
        text_end: End time (seconds) of the transcribed text segment.
        all_segments: Iterable of (speaker_start, speaker_end, speaker) tuples,
            as produced by the pyannote diarization pass.

    Returns:
        The speaker label with the longest positive overlap, or "UNKNOWN"
        when no diarization turn overlaps the text segment at all (the
        caller then falls back to nearest-neighbor matching).
    """
    overlaps = []
    for speaker_start, speaker_end, speaker in all_segments:
        overlap_duration = self.calculate_overlap_duration(
            text_start, text_end, speaker_start, speaker_end
        )
        if overlap_duration > 0:
            overlaps.append((overlap_duration, speaker))
    if not overlaps:
        return "UNKNOWN"
    # max() with a key reads the single longest overlap directly instead of
    # sorting the whole list; like the stable reverse sort it replaces, ties
    # resolve to the first segment encountered.
    return max(overlaps, key=lambda pair: pair[0])[1]
def find_best_speaker_for_text(self,text_start, text_end, all_segments): """为文本片段找到最佳匹配的说话人""" result = self.find_best_speaker_for_text_by_over(text_start=text_start, text_end=text_end, all_segments=all_segments); if result != "UNKNOWN": return result # 如果没有重叠,则找距离最近的说话人 else: min_distance = 9999 best_speaker = "UNKNOWN" for speaker_start, speaker_end, speaker in all_segments: if text_end < speaker_start: distance = speaker_start - text_end elif text_start > speaker_end: distance = text_start - speaker_end else: distance = 0 # 有重叠的情况已经在前面处理过了 if distance < min_distance: min_distance = distance best_speaker = speaker return best_speakertext_segments_with_speaker = []for segment in text_result['segments']: text_start = segment['start'] text_end = segment['end'] text_content = segment['text'] best_speaker = self.find_best_speaker_for_text(text_start, text_end, speaker_segments) text_segments_with_speaker.append({ "speaker": best_speaker, "text": text_content })# 合并相邻且 speaker 一致的文本片段merged_segments = []for seg in text_segments_with_speaker: if merged_segments and merged_segments[-1]["speaker"] == seg["speaker"]: merged_segments[-1]["text"] += " " + seg["text"] else: merged_segments.append(seg)最终结果如下:{ "segments": [ { "speaker": "SPEAKER_01", "text": "你好" }, { "speaker": "SPEAKER_00", "text": "我是那个充电桩的 我看你在网上咨询充电桩是吧" }, { "speaker": "SPEAKER_01", "text": "对对对" }, { "speaker": "SPEAKER_00", "text": "您的场地大概什么情况 你给我说一下 我给你匹配一下合适的产品" }, { "speaker": "SPEAKER_01", "text": "不是场地 我们是装在那个设备上面" }, { "speaker": "SPEAKER_00", "text": "什么设备 光伏是吧" }, { "speaker": "SPEAKER_01", "text": "光伏板" } ]}后记其实代码都是AI写的,而且,录音文件准备中,mp3转wav是报错时AI提供的解决方案。天已经变了 |
|