火山引擎 语音合成 完全自费 亲测好用!懒癌患者滑到最后!
hhs
撰写于 2024年 04月 04 日

火山引擎 语音合成 完全自费 亲测好用!

说在前面

不是魔音和逗哥用不起,而是自研更有性价比

为此,找到了最简单的demo【官方提供】

#coding=utf-8

'''
requires Python 3.6 or later
pip install requests
'''
import base64
import json
import traceback
import uuid

import requests

# Fill in the appid, access_token and cluster obtained from the platform.
appid = "************"
access_token = "***********"
cluster = "volcano_tts"

voice_type = "BV102_streaming"
host = "openspeech.bytedance.com"
api_url = f"https://{host}/api/v1/tts"

# The token is sent as "Bearer;<token>" -- note the semicolon separator.
header = {"Authorization": f"Bearer;{access_token}"}

request_json = {
    "app": {
        "appid": appid,
        # NOTE(review): this is the literal string "access_token", not the
        # variable above; the real token travels in the Authorization header.
        # Confirm against the API documentation before changing it.
        "token": "access_token",
        "cluster": cluster
    },
    "user": {
        "uid": "********"
    },
    "audio": {
        "voice_type": voice_type,
        "encoding": "mp3",
        "speed_ratio": 1.0,
        "volume_ratio": 1.0,
        "pitch_ratio": 1.0,
    },
    "request": {
        # A fresh request id is generated every time the script is run.
        "reqid": str(uuid.uuid4()),
        "text": "字节跳动语音合成",
        "text_type": "plain",
        "operation": "query",
        "with_frontend": 1,
        "frontend_type": "unitTson"

    }
}

if __name__ == '__main__':
    try:
        # A timeout keeps the script from hanging forever on a stalled connection.
        resp = requests.post(api_url, json.dumps(request_json), headers=header, timeout=30)
        # Parse the response body once and reuse it.
        resp_body = resp.json()
        print(f"resp body: \n{resp_body}")
        if "data" in resp_body:
            # "data" holds the base64-encoded audio; the context manager
            # guarantees the file is flushed and closed.
            with open("test_submit.mp3", "wb") as file_to_save:
                file_to_save.write(base64.b64decode(resp_body["data"]))
    except Exception:
        # BaseException.with_traceback() requires a traceback argument, so the
        # original no-argument call raised TypeError; print the traceback instead.
        traceback.print_exc()

将其中的 ************ 等占位符替换成自己账号对应的 appid、access_token 就可以运行

test_submit.mp3

感觉还行!

于是就扩展了第一版代码

第一版代码

import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox, filedialog
import requests
import uuid
import base64

def synthesis(text, appid, cluster, voice_type, speed, token, timeout=30):
    """Request MP3 speech for *text* from the openspeech.bytedance.com TTS API.

    Args:
        text: Text to synthesize, sent as ``request.text``.
        appid: Application id, sent as ``app.appid``.
        cluster: Cluster name, sent as ``app.cluster``.
        voice_type: Voice identifier, sent as ``audio.voice_type``.
        speed: Speaking-rate multiplier, sent as ``audio.speed_ratio``.
        token: Access token, sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The base64-decoded audio payload as ``bytes``.

    Raises:
        Exception: If the HTTP status is not 200, the reply is not JSON, the
            ``code`` field is not 3000, or the reply carries no ``data``.
        requests.RequestException: On network failures or timeouts.
    """
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{token}"
    }
    body = {
        "app": {"appid": appid, "cluster": cluster},
        "user": {"uid": "default_uid"},
        "audio": {
            "voice_type": voice_type,
            "encoding": "mp3",
            "speed_ratio": speed,
            "volume_ratio": 1.0,
            "pitch_ratio": 1.0
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "operation": "query"
        }
    }
    # Without a timeout a stalled connection would block the caller indefinitely.
    response = requests.post(url, json=body, headers=headers, timeout=timeout)
    try:
        response_data = response.json()
    except ValueError:
        # Non-JSON reply (e.g. an HTML error page): report the HTTP status
        # instead of surfacing a JSON decoding error.
        raise Exception(f"合成失败: HTTP {response.status_code}") from None
    if not isinstance(response_data, dict):
        response_data = {}
    if (response.status_code == 200
            and response_data.get("code") == 3000
            and response_data.get("data")):
        return base64.b64decode(response_data["data"])
    raise Exception(f"合成失败: {response_data.get('message', '未知错误')}")

def convert_to_speech():
    """Button callback: synthesize the entered text and save it as an MP3 file.

    Reads the text, credentials, voice and speed from the module-level
    widgets, calls ``synthesis`` and then asks the user where to save the
    result. Errors raised during synthesis or saving are shown in a dialog.
    """
    text = text_input.get("1.0", tk.END).strip()
    if not text:
        messagebox.showwarning("警告", "请输入要转换的文本!")
        return

    # The combobox value may not be a key of voice_mapping (e.g. free-typed
    # text); look it up defensively instead of raising an unhandled KeyError.
    selected_voice = voice_type_combobox.get()
    voice_type = voice_mapping.get(selected_voice)
    if voice_type is None:
        messagebox.showwarning("警告", "请选择有效的音色!")
        return

    try:
        audio_data = synthesis(text, appid_entry.get(), cluster_entry.get(), voice_type, speed_scale.get(), token_entry.get())
        save_path = filedialog.asksaveasfilename(defaultextension=".mp3", filetypes=[("MP3文件", "*.mp3")])
        # An empty path means the user cancelled the dialog.
        if save_path:
            with open(save_path, 'wb') as f:
                f.write(audio_data)
            messagebox.showinfo("成功", "音频已成功保存!")
    except Exception as e:
        messagebox.showerror("错误", str(e))

# Build the main window.
root = tk.Tk()
root.title("文本转语音")

# Maps the voice name shown in the combobox to the voice_type id sent to the API.
voice_mapping = {
    "通用女声": "BV001_streaming",
    "通用男声": "BV002_streaming",
    "儒雅青年": "BV102_streaming",
    "知性姐姐-双语": "BV034_streaming",
    "温柔小哥": "BV033_streaming",
    "活泼女声": "BV005_streaming",
    "奶气萌娃": "BV051_streaming",
    "亲切女声": "BV007_streaming",
    "阳光男声": "BV056_streaming",
    "东北老铁": "BV021_streaming"
}

# UI layout: field labels go in column 0, one per row.
ttk.Label(root, text="AppID:").grid(column=0, row=0, sticky=tk.W)
ttk.Label(root, text="Cluster:").grid(column=0, row=1, sticky=tk.W)
ttk.Label(root, text="Access Token:").grid(column=0, row=2, sticky=tk.W)
ttk.Label(root, text="音色选择:").grid(column=0, row=3, sticky=tk.W)
ttk.Label(root, text="朗读速度:").grid(column=0, row=4, sticky=tk.W)

# Credential inputs, read by convert_to_speech when the button is pressed.
appid_entry = ttk.Entry(root)
appid_entry.grid(column=1, row=0, sticky=tk.EW)

cluster_entry = ttk.Entry(root)
cluster_entry.grid(column=1, row=1, sticky=tk.EW)

token_entry = ttk.Entry(root)
token_entry.grid(column=1, row=2, sticky=tk.EW)

# Voice selector, pre-selecting the first entry of voice_mapping.
# NOTE(review): no state="readonly" is set, so the user can presumably type
# a value that is not a key of voice_mapping -- confirm and handle.
voice_type_combobox = ttk.Combobox(root, values=list(voice_mapping.keys()))
voice_type_combobox.grid(column=1, row=3, sticky=tk.EW)
voice_type_combobox.current(0)

# Speed slider (0.2 to 3) bound to speed_var; the label shows the same variable.
speed_var = tk.DoubleVar(value=1)
speed_scale = ttk.Scale(root, from_=0.2, to=3, variable=speed_var, orient=tk.HORIZONTAL)
speed_scale.grid(column=1, row=4, sticky=tk.EW)
speed_label = ttk.Label(root, textvariable=speed_var)
speed_label.grid(column=2, row=4, sticky=tk.W)

# Multi-line text box holding the text to synthesize.
ttk.Label(root, text="文本内容:").grid(column=0, row=5, sticky=tk.NW)
text_input = scrolledtext.ScrolledText(root, height=10)
text_input.grid(column=0, row=6, columnspan=3, sticky=tk.EW)

# The button triggers synthesis and the save dialog.
convert_button = ttk.Button(root, text="转换为语音", command=convert_to_speech)
convert_button.grid(column=0, row=7, columnspan=3, sticky=tk.EW)

# Enter the Tk event loop.
root.mainloop()

运行以后是这个样子
2024-04-04T11:19:24.png

做个测试
2024-04-04T11:21:07.png

文件如下

111.mp3

效果还不错对吧

为了做得更好一点,于是添加了速度读数的显示

第二版代码

import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox, filedialog
import requests
import uuid
import base64
import threading

# Text-to-speech synthesis function
def synthesis(text, appid, cluster, voice_type, speed, token, timeout=30):
    """Request MP3 speech for *text* from the openspeech.bytedance.com TTS API.

    Args:
        text: Text to synthesize, sent as ``request.text``.
        appid: Application id, sent as ``app.appid``.
        cluster: Cluster name, sent as ``app.cluster``.
        voice_type: Voice identifier, sent as ``audio.voice_type``.
        speed: Speaking-rate multiplier, sent as ``audio.speed_ratio``.
        token: Access token, sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The base64-decoded audio payload as ``bytes``.

    Raises:
        Exception: If the HTTP status is not 200, the reply is not JSON, the
            ``code`` field is not 3000, or the reply carries no ``data``.
        requests.RequestException: On network failures or timeouts.
    """
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{token}"
    }
    body = {
        "app": {"appid": appid, "cluster": cluster},
        "user": {"uid": "default_uid"},
        "audio": {
            "voice_type": voice_type,
            "encoding": "mp3",
            "speed_ratio": speed,
            "volume_ratio": 1.0,
            "pitch_ratio": 1.0
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "operation": "query"
        }
    }
    # Without a timeout a stalled connection would block the caller indefinitely.
    response = requests.post(url, json=body, headers=headers, timeout=timeout)
    try:
        response_data = response.json()
    except ValueError:
        # Non-JSON reply (e.g. an HTML error page): report the HTTP status
        # instead of surfacing a JSON decoding error.
        raise Exception(f"合成失败: HTTP {response.status_code}") from None
    if not isinstance(response_data, dict):
        response_data = {}
    if (response.status_code == 200
            and response_data.get("code") == 3000
            and response_data.get("data")):
        return base64.b64decode(response_data["data"])
    raise Exception(f"合成失败: {response_data.get('message', '未知错误')}")

# GUI conversion callback
def convert_to_speech():
    """Button callback: synthesize the entered text and save it as an MP3 file.

    Reads the text, credentials, voice and speed from the module-level
    widgets, calls ``synthesis`` and then asks the user where to save the
    result. Errors raised during synthesis or saving are shown in a dialog.
    """
    text = text_input.get("1.0", tk.END).strip()
    if not text:
        messagebox.showwarning("警告", "请输入要转换的文本!")
        return

    # The combobox value may not be a key of voice_mapping (e.g. free-typed
    # text); look it up defensively instead of raising an unhandled KeyError.
    selected_voice = voice_type_combobox.get()
    voice_type = voice_mapping.get(selected_voice)
    if voice_type is None:
        messagebox.showwarning("警告", "请选择有效的音色!")
        return

    try:
        audio_data = synthesis(text, appid_entry.get(), cluster_entry.get(), voice_type, speed_var.get(), token_entry.get())
        save_path = filedialog.asksaveasfilename(defaultextension=".mp3", filetypes=[("MP3文件", "*.mp3")])
        # An empty path means the user cancelled the dialog.
        if save_path:
            with open(save_path, 'wb') as f:
                f.write(audio_data)
            messagebox.showinfo("成功", "音频已成功保存!")
    except Exception as e:
        messagebox.showerror("错误", str(e))

# Startup notice, intended to be run from a worker thread
def show_startup_message_threaded():
    """Show the startup information dialog.

    NOTE(review): this is used as a thread target; confirm that opening a Tk
    dialog outside the main thread is safe on the target platforms.
    """
    messagebox.showinfo(title="信息提示", message="xwean.com")

# Launch the startup-notice thread
def start_startup_message_thread():
    """Start a thread that runs ``show_startup_message_threaded``."""
    threading.Thread(target=show_startup_message_threaded).start()

# Keep the speed read-out in sync with the slider, rounded to two decimals
def update_speed_label(event=None):
    """Write the slider's current value into ``speed_label`` with two decimals.

    Args:
        event: Value passed by the caller (the slider's ``command``); unused.
    """
    current_speed = speed_scale.get()
    speed_label.config(text=f"{current_speed:.2f}")

# Build the main window.
root = tk.Tk()
root.title("文本转语音")

# Maps the voice name shown in the combobox to the voice_type id sent to the API.
voice_mapping = {
    "通用女声": "BV001_streaming",
    "通用男声": "BV002_streaming",
    "儒雅青年": "BV102_streaming",
    "知性姐姐-双语": "BV034_streaming",
    "温柔小哥": "BV033_streaming",
    "活泼女声": "BV005_streaming",
    "奶气萌娃": "BV051_streaming",
    "亲切女声": "BV007_streaming",
    "阳光男声": "BV056_streaming",
    "东北老铁": "BV021_streaming"
}

# UI layout: field labels go in column 0, one per row.
ttk.Label(root, text="AppID:").grid(column=0, row=0, sticky=tk.W)
ttk.Label(root, text="Cluster:").grid(column=0, row=1, sticky=tk.W)
ttk.Label(root, text="Access Token:").grid(column=0, row=2, sticky=tk.W)
ttk.Label(root, text="音色选择:").grid(column=0, row=3, sticky=tk.W)
ttk.Label(root, text="朗读速度:").grid(column=0, row=4, sticky=tk.W)

# Credential inputs, read by convert_to_speech when the button is pressed.
appid_entry = ttk.Entry(root)
appid_entry.grid(column=1, row=0, sticky=tk.EW)

cluster_entry = ttk.Entry(root)
cluster_entry.grid(column=1, row=1, sticky=tk.EW)

token_entry = ttk.Entry(root)
token_entry.grid(column=1, row=2, sticky=tk.EW)

# Voice selector, pre-selecting the first entry of voice_mapping.
# NOTE(review): no state="readonly" is set, so the user can presumably type
# a value that is not a key of voice_mapping -- confirm and handle.
voice_type_combobox = ttk.Combobox(root, values=list(voice_mapping.keys()))
voice_type_combobox.grid(column=1, row=3, sticky=tk.EW)
voice_type_combobox.current(0)

# Speed slider (0.2 to 3); update_speed_label is its command callback and
# rewrites the label text with a two-decimal value.
speed_var = tk.DoubleVar(value=1.0)
speed_scale = ttk.Scale(root, from_=0.2, to=3, variable=speed_var, orient=tk.HORIZONTAL, command=update_speed_label)
speed_scale.grid(column=1, row=4, sticky=tk.EW)
speed_label = ttk.Label(root, text="1.00")  # initial value
speed_label.grid(column=2, row=4, sticky=tk.W)

# Multi-line text box holding the text to synthesize.
ttk.Label(root, text="文本内容:").grid(column=0, row=5, sticky=tk.NW)
text_input = scrolledtext.ScrolledText(root, height=10)
text_input.grid(column=0, row=6, columnspan=3, sticky=tk.EW)

# The button triggers synthesis and the save dialog.
convert_button = ttk.Button(root, text="转换为语音", command=convert_to_speech)
convert_button.grid(column=0, row=7, columnspan=3, sticky=tk.EW)

# Schedule the startup-notice thread 100 ms after the event loop starts.
root.after(100, start_startup_message_thread)

# Enter the Tk event loop.
root.mainloop()

第二版主要优化了朗读速度的数值显示问题,并且增加了启动时的信息提示框

2024-04-04T11:23:08.png
添加了提示框

2024-04-04T11:23:36.png
也增加了滑动数值,下面试试

2024-04-04T11:25:18.png
3.0的速度
7766.mp3
然后是1.5 的速度

6633.mp3

感觉还行
但是又发现了一个新的问题
似乎每次下载都要手动输入一下文件名

第三版代码

import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox, filedialog
import requests
import uuid
import base64
import threading
from datetime import datetime

# Text-to-speech synthesis function
def synthesis(text, appid, cluster, voice_type, speed, token, timeout=30):
    """Request MP3 speech for *text* from the openspeech.bytedance.com TTS API.

    Args:
        text: Text to synthesize, sent as ``request.text``.
        appid: Application id, sent as ``app.appid``.
        cluster: Cluster name, sent as ``app.cluster``.
        voice_type: Voice identifier, sent as ``audio.voice_type``.
        speed: Speaking-rate multiplier, sent as ``audio.speed_ratio``.
        token: Access token, sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The base64-decoded audio payload as ``bytes``.

    Raises:
        Exception: If the HTTP status is not 200, the reply is not JSON, the
            ``code`` field is not 3000, or the reply carries no ``data``.
        requests.RequestException: On network failures or timeouts.
    """
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{token}"
    }
    body = {
        "app": {"appid": appid, "cluster": cluster},
        "user": {"uid": "default_uid"},
        "audio": {
            "voice_type": voice_type,
            "encoding": "mp3",
            "speed_ratio": speed,
            "volume_ratio": 1.0,
            "pitch_ratio": 1.0
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "operation": "query"
        }
    }
    # Without a timeout a stalled connection would block the caller indefinitely.
    response = requests.post(url, json=body, headers=headers, timeout=timeout)
    try:
        response_data = response.json()
    except ValueError:
        # Non-JSON reply (e.g. an HTML error page): report the HTTP status
        # instead of surfacing a JSON decoding error.
        raise Exception(f"合成失败: HTTP {response.status_code}") from None
    if not isinstance(response_data, dict):
        response_data = {}
    if (response.status_code == 200
            and response_data.get("code") == 3000
            and response_data.get("data")):
        return base64.b64decode(response_data["data"])
    raise Exception(f"合成失败: {response_data.get('message', '未知错误')}")

# GUI conversion callback
def convert_to_speech():
    """Button callback: synthesize the entered text and save it as an MP3 file.

    Reads the text, credentials, voice and speed from the module-level
    widgets, calls ``synthesis`` and then asks the user where to save the
    result, proposing a timestamp-based file name. Errors raised during
    synthesis or saving are shown in a dialog.
    """
    text = text_input.get("1.0", tk.END).strip()
    if not text:
        messagebox.showwarning("警告", "请输入要转换的文本!")
        return

    # The combobox value may not be a key of voice_mapping (e.g. free-typed
    # text); look it up defensively instead of raising an unhandled KeyError.
    selected_voice = voice_type_combobox.get()
    voice_info = voice_mapping.get(selected_voice)
    if voice_info is None:
        messagebox.showwarning("警告", "请选择有效的音色!")
        return
    voice_type = voice_info["voice_type"]

    try:
        audio_data = synthesis(text, appid_entry.get(), cluster_entry.get(), voice_type, speed_var.get(), token_entry.get())
        # Pre-fill the dialog with the current local time as the file name.
        default_filename = datetime.now().strftime("%Y-%m-%d_%H-%M-%S.mp3")
        save_path = filedialog.asksaveasfilename(initialfile=default_filename, defaultextension=".mp3", filetypes=[("MP3文件", "*.mp3")])
        # An empty path means the user cancelled the dialog.
        if save_path:
            with open(save_path, 'wb') as f:
                f.write(audio_data)
            messagebox.showinfo("成功", "音频已成功保存!")
    except Exception as e:
        messagebox.showerror("错误", str(e))

# Startup notice, intended to be run from a worker thread
def show_startup_message_threaded():
    """Show the startup information dialog.

    NOTE(review): this is used as a thread target; confirm that opening a Tk
    dialog outside the main thread is safe on the target platforms.
    """
    messagebox.showinfo(title="信息提示", message="xwean.com")

# Launch the startup-notice thread
def start_startup_message_thread():
    """Start a thread that runs ``show_startup_message_threaded``."""
    threading.Thread(target=show_startup_message_threaded).start()

# Keep the speed read-out in sync with the slider, rounded to two decimals
def update_speed_label(event=None):
    """Write the slider's current value into ``speed_label`` with two decimals.

    Args:
        event: Value passed by the caller (the slider's ``command``); unused.
    """
    current_speed = speed_scale.get()
    speed_label.config(text=f"{current_speed:.2f}")

# Build the main window.
root = tk.Tk()
root.title("文本转语音")

# Maps the display name shown in the combobox to a record whose "voice_type"
# is the id sent to the API; the other two fields repeat the scene and
# language already encoded in the display name.
voice_mapping = {
    "通用女声-通用场景-中文": {"voice_type": "BV001_streaming", "场景": "通用场景", "语种": "中文"},
    "通用男声-通用场景-中文": {"voice_type": "BV002_streaming", "场景": "通用场景", "语种": "中文"},
    "日语男声-多语种-日语": {"voice_type": "BV524_streaming", "场景": "多语种", "语种": "日语"},
    "甜宠少御-有声阅读-中文": {"voice_type": "BV113_streaming", "场景": "有声阅读", "语种": "中文"},
    "古风少御-有声阅读-中文": {"voice_type": "BV115_streaming", "场景": "有声阅读", "语种": "中文"},
    "炀炀-通用场景-中文": {"voice_type": "BV705_streaming", "场景": "通用场景", "语种": "中文"},
    "重庆小伙-方言-重庆话": {"voice_type": "BV019_streaming", "场景": "方言", "语种": "重庆话"},
    "广西表哥-方言-广西普通话": {"voice_type": "BV213_streaming", "场景": "方言", "语种": "广西普通话"},
    "气质女生-多语种-日语": {"voice_type": "BV522_streaming", "场景": "多语种", "语种": "日语"},
    "通用赘婿-有声阅读-中文": {"voice_type": "BV119_streaming", "场景": "有声阅读", "语种": "中文"},
    "擎苍-有声阅读-中文": {"voice_type": "BV701_streaming", "场景": "有声阅读", "语种": "中文"},
    "活力男声-Jackson-美式发音-英语": {"voice_type": "BV504_streaming", "场景": "美式发音", "语种": "英语"},
    "灿灿-通用场景-中文": {"voice_type": "BV700_streaming", "场景": "通用场景", "语种": "中文"},
    "活力女声-Ariana-美式发音-英语": {"voice_type": "BV503_streaming", "场景": "美式发音", "语种": "英语"},
    "儒雅青年-有声阅读-中文": {"voice_type": "BV102_streaming", "场景": "有声阅读", "语种": "中文"},
    "知性姐姐-双语-教育场景-中文": {"voice_type": "BV034_streaming", "场景": "教育场景", "语种": "中文"},
    "温柔小哥-教育场景-中文": {"voice_type": "BV033_streaming", "场景": "教育场景", "语种": "中文"},
    "活泼女声-视频配音-中文": {"voice_type": "BV005_streaming", "场景": "视频配音", "语种": "中文"},
    "奶气萌娃-特色音色-中文": {"voice_type": "BV051_streaming", "场景": "特色音色", "语种": "中文"},
    "亲切女声-客服场景-中文": {"voice_type": "BV007_streaming", "场景": "客服场景", "语种": "中文"},
    "阳光男声-视频配音-中文": {"voice_type": "BV056_streaming", "场景": "视频配音", "语种": "中文"},
    "东北老铁-方言-东北话": {"voice_type": "BV021_streaming", "场景": "方言", "语种": "东北话"}
}


# UI layout: each row pairs a label in column 0 with its input in column 1.
ttk.Label(root, text="AppID:").grid(column=0, row=0, sticky=tk.W)
appid_entry = ttk.Entry(root)
appid_entry.grid(column=1, row=0, sticky=tk.EW)

ttk.Label(root, text="Cluster:").grid(column=0, row=1, sticky=tk.W)
cluster_entry = ttk.Entry(root)
cluster_entry.grid(column=1, row=1, sticky=tk.EW)

ttk.Label(root, text="Access Token:").grid(column=0, row=2, sticky=tk.W)
token_entry = ttk.Entry(root)
token_entry.grid(column=1, row=2, sticky=tk.EW)

# Voice selector, pre-selecting the first entry of voice_mapping.
# NOTE(review): no state="readonly" is set, so the user can presumably type
# a value that is not a key of voice_mapping -- confirm and handle.
ttk.Label(root, text="音色选择:").grid(column=0, row=3, sticky=tk.W)
voice_type_combobox = ttk.Combobox(root, values=list(voice_mapping.keys()))
voice_type_combobox.grid(column=1, row=3, sticky=tk.EW)
voice_type_combobox.current(0)

# Speed slider (0.2 to 3); update_speed_label is its command callback and
# rewrites the label text with a two-decimal value.
ttk.Label(root, text="朗读速度:").grid(column=0, row=4, sticky=tk.W)
speed_var = tk.DoubleVar(value=1.0)
speed_scale = ttk.Scale(root, from_=0.2, to=3, variable=speed_var, orient=tk.HORIZONTAL, command=update_speed_label)
speed_scale.grid(column=1, row=4, sticky=tk.EW)
speed_label = ttk.Label(root, text="1.00")
speed_label.grid(column=2, row=4, sticky=tk.W)

# Multi-line text box holding the text to synthesize.
ttk.Label(root, text="文本内容:").grid(column=0, row=5, sticky=tk.NW)
text_input = scrolledtext.ScrolledText(root, height=10)
text_input.grid(column=0, row=6, columnspan=3, sticky=tk.EW)

# The button triggers synthesis and the save dialog.
convert_button = ttk.Button(root, text="转换为语音", command=convert_to_speech)
convert_button.grid(column=0, row=7, columnspan=3, sticky=tk.EW)

# Schedule the startup-notice thread 100 ms after the event loop starts.
root.after(100, start_startup_message_thread)

# Enter the Tk event loop.
root.mainloop()

更新后,下载时自动用时间命名!同时支持22款目前免费的声音,需要的朋友自己去官网搞一下!

OK!

最终教程!


打开

https://www.volcengine.com/
2024-04-04T11:31:51.png

该注册的去注册,然后直接登录,建议先储值2元钱 以防欠费

2024-04-04T11:32:45.png

也可以先购买优惠包,但建议后买

2024-04-04T11:34:05.png

创建
2024-04-04T11:34:50.png

选择
2024-04-04T11:35:39.png

其实也可以多选 或者 都选 【但没测试过】
2024-04-04T11:36:10.png

这是自己创建的

2024-04-04T11:36:43.png

短语音 直接点击 语音合成 长的点击 长的
2024-04-04T11:37:39.png
2024-04-04T11:38:06.png

重要的
1 音色购买,要是没有 就无法使用

2024-04-04T11:38:55.png

免费的直接全部开开!

完事以后
2024-04-04T11:39:38.png

出现声音名称 和 编号

也就是这个地方

2024-04-04T11:40:23.png

不过,第三版的已经整理完了,拿来就用 时间截止到2024年4月4日的所有免费声音

开始填入

2024-04-04T11:42:04.png

然后就能合成了

其实长语音这地方没搞懂,没有 Cluster ID,估计是自动的吧……

教程到这里就结束了

封装

2024-04-04T11:49:31.png

2024-04-04T11:49:42.png

2024-04-04T11:49:55.png

封装的是第三版

链接:https://pan.baidu.com/s/1sFM_9Wf1UGgIxxfnDPLXIw?pwd=kxpp
提取码:kxpp

下载就能用



嫌麻烦?有付费版本的

https://console.volcengine.com/accp/works-management
2024-04-04T11:51:58.png
2024-04-04T11:52:07.png
2024-04-04T11:52:15.png

这上面所有的声音都能用,需要什么看个人

可以对比一下 魔音 和 逗哥

下课!

火山引擎 语音合成 完全自费 亲测好用!懒癌患者滑到最后!

火山引擎 语音合成 完全自费 亲测好用!

说在前面

不是魔音和逗哥用不起,而是自研更有性价比

为此,找到了最简单的demo【官方提供】

#coding=utf-8

'''
requires Python 3.6 or later
pip install requests
'''
import base64
import json
import traceback
import uuid

import requests

# Fill in the appid, access_token and cluster obtained from the platform.
appid = "************"
access_token = "***********"
cluster = "volcano_tts"

voice_type = "BV102_streaming"
host = "openspeech.bytedance.com"
api_url = f"https://{host}/api/v1/tts"

# The token is sent as "Bearer;<token>" -- note the semicolon separator.
header = {"Authorization": f"Bearer;{access_token}"}

request_json = {
    "app": {
        "appid": appid,
        # NOTE(review): this is the literal string "access_token", not the
        # variable above; the real token travels in the Authorization header.
        # Confirm against the API documentation before changing it.
        "token": "access_token",
        "cluster": cluster
    },
    "user": {
        "uid": "********"
    },
    "audio": {
        "voice_type": voice_type,
        "encoding": "mp3",
        "speed_ratio": 1.0,
        "volume_ratio": 1.0,
        "pitch_ratio": 1.0,
    },
    "request": {
        # A fresh request id is generated every time the script is run.
        "reqid": str(uuid.uuid4()),
        "text": "字节跳动语音合成",
        "text_type": "plain",
        "operation": "query",
        "with_frontend": 1,
        "frontend_type": "unitTson"

    }
}

if __name__ == '__main__':
    try:
        # A timeout keeps the script from hanging forever on a stalled connection.
        resp = requests.post(api_url, json.dumps(request_json), headers=header, timeout=30)
        # Parse the response body once and reuse it.
        resp_body = resp.json()
        print(f"resp body: \n{resp_body}")
        if "data" in resp_body:
            # "data" holds the base64-encoded audio; the context manager
            # guarantees the file is flushed and closed.
            with open("test_submit.mp3", "wb") as file_to_save:
                file_to_save.write(base64.b64decode(resp_body["data"]))
    except Exception:
        # BaseException.with_traceback() requires a traceback argument, so the
        # original no-argument call raised TypeError; print the traceback instead.
        traceback.print_exc()

将其中的 ************ 等占位符替换成自己账号对应的 appid、access_token 就可以运行

test_submit.mp3

感觉还行!

于是就扩展了第一版代码

第一版代码

import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox, filedialog
import requests
import uuid
import base64

def synthesis(text, appid, cluster, voice_type, speed, token, timeout=30):
    """Request MP3 speech for *text* from the openspeech.bytedance.com TTS API.

    Args:
        text: Text to synthesize, sent as ``request.text``.
        appid: Application id, sent as ``app.appid``.
        cluster: Cluster name, sent as ``app.cluster``.
        voice_type: Voice identifier, sent as ``audio.voice_type``.
        speed: Speaking-rate multiplier, sent as ``audio.speed_ratio``.
        token: Access token, sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The base64-decoded audio payload as ``bytes``.

    Raises:
        Exception: If the HTTP status is not 200, the reply is not JSON, the
            ``code`` field is not 3000, or the reply carries no ``data``.
        requests.RequestException: On network failures or timeouts.
    """
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{token}"
    }
    body = {
        "app": {"appid": appid, "cluster": cluster},
        "user": {"uid": "default_uid"},
        "audio": {
            "voice_type": voice_type,
            "encoding": "mp3",
            "speed_ratio": speed,
            "volume_ratio": 1.0,
            "pitch_ratio": 1.0
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "operation": "query"
        }
    }
    # Without a timeout a stalled connection would block the caller indefinitely.
    response = requests.post(url, json=body, headers=headers, timeout=timeout)
    try:
        response_data = response.json()
    except ValueError:
        # Non-JSON reply (e.g. an HTML error page): report the HTTP status
        # instead of surfacing a JSON decoding error.
        raise Exception(f"合成失败: HTTP {response.status_code}") from None
    if not isinstance(response_data, dict):
        response_data = {}
    if (response.status_code == 200
            and response_data.get("code") == 3000
            and response_data.get("data")):
        return base64.b64decode(response_data["data"])
    raise Exception(f"合成失败: {response_data.get('message', '未知错误')}")

def convert_to_speech():
    """Button callback: synthesize the entered text and save it as an MP3 file.

    Reads the text, credentials, voice and speed from the module-level
    widgets, calls ``synthesis`` and then asks the user where to save the
    result. Errors raised during synthesis or saving are shown in a dialog.
    """
    text = text_input.get("1.0", tk.END).strip()
    if not text:
        messagebox.showwarning("警告", "请输入要转换的文本!")
        return

    # The combobox value may not be a key of voice_mapping (e.g. free-typed
    # text); look it up defensively instead of raising an unhandled KeyError.
    selected_voice = voice_type_combobox.get()
    voice_type = voice_mapping.get(selected_voice)
    if voice_type is None:
        messagebox.showwarning("警告", "请选择有效的音色!")
        return

    try:
        audio_data = synthesis(text, appid_entry.get(), cluster_entry.get(), voice_type, speed_scale.get(), token_entry.get())
        save_path = filedialog.asksaveasfilename(defaultextension=".mp3", filetypes=[("MP3文件", "*.mp3")])
        # An empty path means the user cancelled the dialog.
        if save_path:
            with open(save_path, 'wb') as f:
                f.write(audio_data)
            messagebox.showinfo("成功", "音频已成功保存!")
    except Exception as e:
        messagebox.showerror("错误", str(e))

# Build the main window.
root = tk.Tk()
root.title("文本转语音")

# Maps the voice name shown in the combobox to the voice_type id sent to the API.
voice_mapping = {
    "通用女声": "BV001_streaming",
    "通用男声": "BV002_streaming",
    "儒雅青年": "BV102_streaming",
    "知性姐姐-双语": "BV034_streaming",
    "温柔小哥": "BV033_streaming",
    "活泼女声": "BV005_streaming",
    "奶气萌娃": "BV051_streaming",
    "亲切女声": "BV007_streaming",
    "阳光男声": "BV056_streaming",
    "东北老铁": "BV021_streaming"
}

# UI layout: field labels go in column 0, one per row.
ttk.Label(root, text="AppID:").grid(column=0, row=0, sticky=tk.W)
ttk.Label(root, text="Cluster:").grid(column=0, row=1, sticky=tk.W)
ttk.Label(root, text="Access Token:").grid(column=0, row=2, sticky=tk.W)
ttk.Label(root, text="音色选择:").grid(column=0, row=3, sticky=tk.W)
ttk.Label(root, text="朗读速度:").grid(column=0, row=4, sticky=tk.W)

# Credential inputs, read by convert_to_speech when the button is pressed.
appid_entry = ttk.Entry(root)
appid_entry.grid(column=1, row=0, sticky=tk.EW)

cluster_entry = ttk.Entry(root)
cluster_entry.grid(column=1, row=1, sticky=tk.EW)

token_entry = ttk.Entry(root)
token_entry.grid(column=1, row=2, sticky=tk.EW)

# Voice selector, pre-selecting the first entry of voice_mapping.
# NOTE(review): no state="readonly" is set, so the user can presumably type
# a value that is not a key of voice_mapping -- confirm and handle.
voice_type_combobox = ttk.Combobox(root, values=list(voice_mapping.keys()))
voice_type_combobox.grid(column=1, row=3, sticky=tk.EW)
voice_type_combobox.current(0)

# Speed slider (0.2 to 3) bound to speed_var; the label shows the same variable.
speed_var = tk.DoubleVar(value=1)
speed_scale = ttk.Scale(root, from_=0.2, to=3, variable=speed_var, orient=tk.HORIZONTAL)
speed_scale.grid(column=1, row=4, sticky=tk.EW)
speed_label = ttk.Label(root, textvariable=speed_var)
speed_label.grid(column=2, row=4, sticky=tk.W)

# Multi-line text box holding the text to synthesize.
ttk.Label(root, text="文本内容:").grid(column=0, row=5, sticky=tk.NW)
text_input = scrolledtext.ScrolledText(root, height=10)
text_input.grid(column=0, row=6, columnspan=3, sticky=tk.EW)

# The button triggers synthesis and the save dialog.
convert_button = ttk.Button(root, text="转换为语音", command=convert_to_speech)
convert_button.grid(column=0, row=7, columnspan=3, sticky=tk.EW)

# Enter the Tk event loop.
root.mainloop()

运行以后是这个样子
2024-04-04T11:19:24.png

做个测试
2024-04-04T11:21:07.png

文件如下

111.mp3

效果还不错对吧

为了做得更好一点,于是添加了速度读数的显示

第二版代码

import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox, filedialog
import requests
import uuid
import base64
import threading

# Text-to-speech synthesis function
def synthesis(text, appid, cluster, voice_type, speed, token, timeout=30):
    """Request MP3 speech for *text* from the openspeech.bytedance.com TTS API.

    Args:
        text: Text to synthesize, sent as ``request.text``.
        appid: Application id, sent as ``app.appid``.
        cluster: Cluster name, sent as ``app.cluster``.
        voice_type: Voice identifier, sent as ``audio.voice_type``.
        speed: Speaking-rate multiplier, sent as ``audio.speed_ratio``.
        token: Access token, sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The base64-decoded audio payload as ``bytes``.

    Raises:
        Exception: If the HTTP status is not 200, the reply is not JSON, the
            ``code`` field is not 3000, or the reply carries no ``data``.
        requests.RequestException: On network failures or timeouts.
    """
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{token}"
    }
    body = {
        "app": {"appid": appid, "cluster": cluster},
        "user": {"uid": "default_uid"},
        "audio": {
            "voice_type": voice_type,
            "encoding": "mp3",
            "speed_ratio": speed,
            "volume_ratio": 1.0,
            "pitch_ratio": 1.0
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "operation": "query"
        }
    }
    # Without a timeout a stalled connection would block the caller indefinitely.
    response = requests.post(url, json=body, headers=headers, timeout=timeout)
    try:
        response_data = response.json()
    except ValueError:
        # Non-JSON reply (e.g. an HTML error page): report the HTTP status
        # instead of surfacing a JSON decoding error.
        raise Exception(f"合成失败: HTTP {response.status_code}") from None
    if not isinstance(response_data, dict):
        response_data = {}
    if (response.status_code == 200
            and response_data.get("code") == 3000
            and response_data.get("data")):
        return base64.b64decode(response_data["data"])
    raise Exception(f"合成失败: {response_data.get('message', '未知错误')}")

# GUI conversion callback
def convert_to_speech():
    """Button callback: synthesize the entered text and save it as an MP3 file.

    Reads the text, credentials, voice and speed from the module-level
    widgets, calls ``synthesis`` and then asks the user where to save the
    result. Errors raised during synthesis or saving are shown in a dialog.
    """
    text = text_input.get("1.0", tk.END).strip()
    if not text:
        messagebox.showwarning("警告", "请输入要转换的文本!")
        return

    # The combobox value may not be a key of voice_mapping (e.g. free-typed
    # text); look it up defensively instead of raising an unhandled KeyError.
    selected_voice = voice_type_combobox.get()
    voice_type = voice_mapping.get(selected_voice)
    if voice_type is None:
        messagebox.showwarning("警告", "请选择有效的音色!")
        return

    try:
        audio_data = synthesis(text, appid_entry.get(), cluster_entry.get(), voice_type, speed_var.get(), token_entry.get())
        save_path = filedialog.asksaveasfilename(defaultextension=".mp3", filetypes=[("MP3文件", "*.mp3")])
        # An empty path means the user cancelled the dialog.
        if save_path:
            with open(save_path, 'wb') as f:
                f.write(audio_data)
            messagebox.showinfo("成功", "音频已成功保存!")
    except Exception as e:
        messagebox.showerror("错误", str(e))

# Startup notice, intended to be run from a worker thread
def show_startup_message_threaded():
    """Show the startup information dialog.

    NOTE(review): this is used as a thread target; confirm that opening a Tk
    dialog outside the main thread is safe on the target platforms.
    """
    messagebox.showinfo(title="信息提示", message="xwean.com")

# Launch the startup-notice thread
def start_startup_message_thread():
    """Start a thread that runs ``show_startup_message_threaded``."""
    threading.Thread(target=show_startup_message_threaded).start()

# Keep the speed read-out in sync with the slider, rounded to two decimals
def update_speed_label(event=None):
    """Write the slider's current value into ``speed_label`` with two decimals.

    Args:
        event: Value passed by the caller (the slider's ``command``); unused.
    """
    current_speed = speed_scale.get()
    speed_label.config(text=f"{current_speed:.2f}")

# Build the main window.
root = tk.Tk()
root.title("文本转语音")

# Maps the voice name shown in the combobox to the voice_type id sent to the API.
voice_mapping = {
    "通用女声": "BV001_streaming",
    "通用男声": "BV002_streaming",
    "儒雅青年": "BV102_streaming",
    "知性姐姐-双语": "BV034_streaming",
    "温柔小哥": "BV033_streaming",
    "活泼女声": "BV005_streaming",
    "奶气萌娃": "BV051_streaming",
    "亲切女声": "BV007_streaming",
    "阳光男声": "BV056_streaming",
    "东北老铁": "BV021_streaming"
}

# UI layout: field labels go in column 0, one per row.
ttk.Label(root, text="AppID:").grid(column=0, row=0, sticky=tk.W)
ttk.Label(root, text="Cluster:").grid(column=0, row=1, sticky=tk.W)
ttk.Label(root, text="Access Token:").grid(column=0, row=2, sticky=tk.W)
ttk.Label(root, text="音色选择:").grid(column=0, row=3, sticky=tk.W)
ttk.Label(root, text="朗读速度:").grid(column=0, row=4, sticky=tk.W)

# Credential inputs, read by convert_to_speech when the button is pressed.
appid_entry = ttk.Entry(root)
appid_entry.grid(column=1, row=0, sticky=tk.EW)

cluster_entry = ttk.Entry(root)
cluster_entry.grid(column=1, row=1, sticky=tk.EW)

token_entry = ttk.Entry(root)
token_entry.grid(column=1, row=2, sticky=tk.EW)

# Voice selector, pre-selecting the first entry of voice_mapping.
# NOTE(review): no state="readonly" is set, so the user can presumably type
# a value that is not a key of voice_mapping -- confirm and handle.
voice_type_combobox = ttk.Combobox(root, values=list(voice_mapping.keys()))
voice_type_combobox.grid(column=1, row=3, sticky=tk.EW)
voice_type_combobox.current(0)

# Speed slider (0.2 to 3); update_speed_label is its command callback and
# rewrites the label text with a two-decimal value.
speed_var = tk.DoubleVar(value=1.0)
speed_scale = ttk.Scale(root, from_=0.2, to=3, variable=speed_var, orient=tk.HORIZONTAL, command=update_speed_label)
speed_scale.grid(column=1, row=4, sticky=tk.EW)
speed_label = ttk.Label(root, text="1.00")  # initial value
speed_label.grid(column=2, row=4, sticky=tk.W)

# Multi-line text box holding the text to synthesize.
ttk.Label(root, text="文本内容:").grid(column=0, row=5, sticky=tk.NW)
text_input = scrolledtext.ScrolledText(root, height=10)
text_input.grid(column=0, row=6, columnspan=3, sticky=tk.EW)

# The button triggers synthesis and the save dialog.
convert_button = ttk.Button(root, text="转换为语音", command=convert_to_speech)
convert_button.grid(column=0, row=7, columnspan=3, sticky=tk.EW)

# Schedule the startup-notice thread 100 ms after the event loop starts.
root.after(100, start_startup_message_thread)

# Enter the Tk event loop.
root.mainloop()

第二版主要优化了朗读速度的数值显示问题,并且增加了启动时的信息提示框

2024-04-04T11:23:08.png
添加了提示框

2024-04-04T11:23:36.png
也增加了滑动数值,下面试试

2024-04-04T11:25:18.png
3.0的速度
7766.mp3
然后是1.5 的速度

6633.mp3

感觉还行
但是又发现了一个新的问题
似乎每次下载都要手动输入一下文件名

第三版代码

import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox, filedialog
import requests
import uuid
import base64
import threading
from datetime import datetime

# Text-to-speech synthesis function
def synthesis(text, appid, cluster, voice_type, speed, token, timeout=30):
    """Request MP3 speech for *text* from the openspeech.bytedance.com TTS API.

    Args:
        text: Text to synthesize, sent as ``request.text``.
        appid: Application id, sent as ``app.appid``.
        cluster: Cluster name, sent as ``app.cluster``.
        voice_type: Voice identifier, sent as ``audio.voice_type``.
        speed: Speaking-rate multiplier, sent as ``audio.speed_ratio``.
        token: Access token, sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The base64-decoded audio payload as ``bytes``.

    Raises:
        Exception: If the HTTP status is not 200, the reply is not JSON, the
            ``code`` field is not 3000, or the reply carries no ``data``.
        requests.RequestException: On network failures or timeouts.
    """
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{token}"
    }
    body = {
        "app": {"appid": appid, "cluster": cluster},
        "user": {"uid": "default_uid"},
        "audio": {
            "voice_type": voice_type,
            "encoding": "mp3",
            "speed_ratio": speed,
            "volume_ratio": 1.0,
            "pitch_ratio": 1.0
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "operation": "query"
        }
    }
    # Without a timeout a stalled connection would block the caller indefinitely.
    response = requests.post(url, json=body, headers=headers, timeout=timeout)
    try:
        response_data = response.json()
    except ValueError:
        # Non-JSON reply (e.g. an HTML error page): report the HTTP status
        # instead of surfacing a JSON decoding error.
        raise Exception(f"合成失败: HTTP {response.status_code}") from None
    if not isinstance(response_data, dict):
        response_data = {}
    if (response.status_code == 200
            and response_data.get("code") == 3000
            and response_data.get("data")):
        return base64.b64decode(response_data["data"])
    raise Exception(f"合成失败: {response_data.get('message', '未知错误')}")

# GUI conversion callback
def convert_to_speech():
    """Button callback: synthesize the entered text and save it as an MP3 file.

    Reads the text, credentials, voice and speed from the module-level
    widgets, calls ``synthesis`` and then asks the user where to save the
    result, proposing a timestamp-based file name. Errors raised during
    synthesis or saving are shown in a dialog.
    """
    text = text_input.get("1.0", tk.END).strip()
    if not text:
        messagebox.showwarning("警告", "请输入要转换的文本!")
        return

    # The combobox value may not be a key of voice_mapping (e.g. free-typed
    # text); look it up defensively instead of raising an unhandled KeyError.
    selected_voice = voice_type_combobox.get()
    voice_info = voice_mapping.get(selected_voice)
    if voice_info is None:
        messagebox.showwarning("警告", "请选择有效的音色!")
        return
    voice_type = voice_info["voice_type"]

    try:
        audio_data = synthesis(text, appid_entry.get(), cluster_entry.get(), voice_type, speed_var.get(), token_entry.get())
        # Pre-fill the dialog with the current local time as the file name.
        default_filename = datetime.now().strftime("%Y-%m-%d_%H-%M-%S.mp3")
        save_path = filedialog.asksaveasfilename(initialfile=default_filename, defaultextension=".mp3", filetypes=[("MP3文件", "*.mp3")])
        # An empty path means the user cancelled the dialog.
        if save_path:
            with open(save_path, 'wb') as f:
                f.write(audio_data)
            messagebox.showinfo("成功", "音频已成功保存!")
    except Exception as e:
        messagebox.showerror("错误", str(e))

# Startup notice, intended to be run from a worker thread
def show_startup_message_threaded():
    """Show the startup information dialog.

    NOTE(review): this is used as a thread target; confirm that opening a Tk
    dialog outside the main thread is safe on the target platforms.
    """
    messagebox.showinfo(title="信息提示", message="xwean.com")

# Launch the startup-notice thread
def start_startup_message_thread():
    """Start a thread that runs ``show_startup_message_threaded``."""
    threading.Thread(target=show_startup_message_threaded).start()

# Keep the speed read-out in sync with the slider, rounded to two decimals
def update_speed_label(event=None):
    """Write the slider's current value into ``speed_label`` with two decimals.

    Args:
        event: Value passed by the caller (the slider's ``command``); unused.
    """
    current_speed = speed_scale.get()
    speed_label.config(text=f"{current_speed:.2f}")

# Build the main window.
root = tk.Tk()
root.title("文本转语音")

# Maps the display name shown in the combobox to a record whose "voice_type"
# is the id sent to the API; the other two fields repeat the scene and
# language already encoded in the display name.
voice_mapping = {
    "通用女声-通用场景-中文": {"voice_type": "BV001_streaming", "场景": "通用场景", "语种": "中文"},
    "通用男声-通用场景-中文": {"voice_type": "BV002_streaming", "场景": "通用场景", "语种": "中文"},
    "日语男声-多语种-日语": {"voice_type": "BV524_streaming", "场景": "多语种", "语种": "日语"},
    "甜宠少御-有声阅读-中文": {"voice_type": "BV113_streaming", "场景": "有声阅读", "语种": "中文"},
    "古风少御-有声阅读-中文": {"voice_type": "BV115_streaming", "场景": "有声阅读", "语种": "中文"},
    "炀炀-通用场景-中文": {"voice_type": "BV705_streaming", "场景": "通用场景", "语种": "中文"},
    "重庆小伙-方言-重庆话": {"voice_type": "BV019_streaming", "场景": "方言", "语种": "重庆话"},
    "广西表哥-方言-广西普通话": {"voice_type": "BV213_streaming", "场景": "方言", "语种": "广西普通话"},
    "气质女生-多语种-日语": {"voice_type": "BV522_streaming", "场景": "多语种", "语种": "日语"},
    "通用赘婿-有声阅读-中文": {"voice_type": "BV119_streaming", "场景": "有声阅读", "语种": "中文"},
    "擎苍-有声阅读-中文": {"voice_type": "BV701_streaming", "场景": "有声阅读", "语种": "中文"},
    "活力男声-Jackson-美式发音-英语": {"voice_type": "BV504_streaming", "场景": "美式发音", "语种": "英语"},
    "灿灿-通用场景-中文": {"voice_type": "BV700_streaming", "场景": "通用场景", "语种": "中文"},
    "活力女声-Ariana-美式发音-英语": {"voice_type": "BV503_streaming", "场景": "美式发音", "语种": "英语"},
    "儒雅青年-有声阅读-中文": {"voice_type": "BV102_streaming", "场景": "有声阅读", "语种": "中文"},
    "知性姐姐-双语-教育场景-中文": {"voice_type": "BV034_streaming", "场景": "教育场景", "语种": "中文"},
    "温柔小哥-教育场景-中文": {"voice_type": "BV033_streaming", "场景": "教育场景", "语种": "中文"},
    "活泼女声-视频配音-中文": {"voice_type": "BV005_streaming", "场景": "视频配音", "语种": "中文"},
    "奶气萌娃-特色音色-中文": {"voice_type": "BV051_streaming", "场景": "特色音色", "语种": "中文"},
    "亲切女声-客服场景-中文": {"voice_type": "BV007_streaming", "场景": "客服场景", "语种": "中文"},
    "阳光男声-视频配音-中文": {"voice_type": "BV056_streaming", "场景": "视频配音", "语种": "中文"},
    "东北老铁-方言-东北话": {"voice_type": "BV021_streaming", "场景": "方言", "语种": "东北话"}
}


# UI layout: each row pairs a label in column 0 with its input in column 1.
ttk.Label(root, text="AppID:").grid(column=0, row=0, sticky=tk.W)
appid_entry = ttk.Entry(root)
appid_entry.grid(column=1, row=0, sticky=tk.EW)

ttk.Label(root, text="Cluster:").grid(column=0, row=1, sticky=tk.W)
cluster_entry = ttk.Entry(root)
cluster_entry.grid(column=1, row=1, sticky=tk.EW)

ttk.Label(root, text="Access Token:").grid(column=0, row=2, sticky=tk.W)
token_entry = ttk.Entry(root)
token_entry.grid(column=1, row=2, sticky=tk.EW)

# Voice selector, pre-selecting the first entry of voice_mapping.
# NOTE(review): no state="readonly" is set, so the user can presumably type
# a value that is not a key of voice_mapping -- confirm and handle.
ttk.Label(root, text="音色选择:").grid(column=0, row=3, sticky=tk.W)
voice_type_combobox = ttk.Combobox(root, values=list(voice_mapping.keys()))
voice_type_combobox.grid(column=1, row=3, sticky=tk.EW)
voice_type_combobox.current(0)

# Speed slider (0.2 to 3); update_speed_label is its command callback and
# rewrites the label text with a two-decimal value.
ttk.Label(root, text="朗读速度:").grid(column=0, row=4, sticky=tk.W)
speed_var = tk.DoubleVar(value=1.0)
speed_scale = ttk.Scale(root, from_=0.2, to=3, variable=speed_var, orient=tk.HORIZONTAL, command=update_speed_label)
speed_scale.grid(column=1, row=4, sticky=tk.EW)
speed_label = ttk.Label(root, text="1.00")
speed_label.grid(column=2, row=4, sticky=tk.W)

# Multi-line text box holding the text to synthesize.
ttk.Label(root, text="文本内容:").grid(column=0, row=5, sticky=tk.NW)
text_input = scrolledtext.ScrolledText(root, height=10)
text_input.grid(column=0, row=6, columnspan=3, sticky=tk.EW)

# The button triggers synthesis and the save dialog.
convert_button = ttk.Button(root, text="转换为语音", command=convert_to_speech)
convert_button.grid(column=0, row=7, columnspan=3, sticky=tk.EW)

# Schedule the startup-notice thread 100 ms after the event loop starts.
root.after(100, start_startup_message_thread)

# Enter the Tk event loop.
root.mainloop()

更新后,下载时自动用时间命名!同时支持22款目前免费的声音,需要的朋友自己去官网搞一下!

OK!

最终教程!


打开

https://www.volcengine.com/
2024-04-04T11:31:51.png

该注册的去注册,然后直接登录,建议先储值2元钱 以防欠费

2024-04-04T11:32:45.png

也可以先购买优惠包,但建议后买

2024-04-04T11:34:05.png

创建
2024-04-04T11:34:50.png

选择
2024-04-04T11:35:39.png

其实也可以多选 或者 都选 【但没测试过】
2024-04-04T11:36:10.png

这是自己创建的

2024-04-04T11:36:43.png

短语音 直接点击 语音合成 长的点击 长的
2024-04-04T11:37:39.png
2024-04-04T11:38:06.png

重要的
1 音色购买,要是没有 就无法使用

2024-04-04T11:38:55.png

免费的直接全部开开!

完事以后
2024-04-04T11:39:38.png

出现声音名称 和 编号

也就是这个地方

2024-04-04T11:40:23.png

不过,第三版的已经整理完了,拿来就用 时间截止到2024年4月4日的所有免费声音

开始填入

2024-04-04T11:42:04.png

然后就能合成了

其实长语音这地方没搞懂,没有 Cluster ID,估计是自动的吧……

教程到这里就结束了

封装

2024-04-04T11:49:31.png

2024-04-04T11:49:42.png

2024-04-04T11:49:55.png

封装的是第三版

链接:https://pan.baidu.com/s/1sFM_9Wf1UGgIxxfnDPLXIw?pwd=kxpp
提取码:kxpp

下载就能用



嫌麻烦?有付费版本的

https://console.volcengine.com/accp/works-management
2024-04-04T11:51:58.png
2024-04-04T11:52:07.png
2024-04-04T11:52:15.png

这上面所有的声音都能用,需要什么看个人

可以对比一下 魔音 和 逗哥

下课!


版权属于:hhs 所有,采用《知识共享署名许可协议》进行许可,转载请注明文章来源。

本文链接: https://www.xwean.com/1976.html

赞 (5)

猜您想看

评论区(暂无评论)

这里空空如也,快来评论吧~

我要评论