650 lines
23 KiB
JavaScript
650 lines
23 KiB
JavaScript
import { saveTtsProviderSettings } from './index.js';
|
|
|
|
export { ChatterboxTtsProvider };
|
|
|
|
/**
 * TTS provider that talks to a Chatterbox TTS server over HTTP.
 *
 * The provider renders its own settings panel (`settingsHtml`), keeps the
 * panel and `settings` in sync through jQuery event handlers, lists the
 * server's predefined and reference ("clone") voices, and posts synthesis
 * requests to the server's `/tts` endpoint.
 *
 * Reference voices are distinguished from predefined ones by a `ref_` prefix
 * on their `voice_id`; the prefix is stripped before the filename is sent.
 */
class ChatterboxTtsProvider {
    //########//
    // Config //
    //########//

    /** Prefix marking a voice_id as a reference (voice-clone) audio file. */
    static #REFERENCE_PREFIX = 'ref_';

    /** Current provider settings; defaults are filled in by the constructor. */
    settings = {};

    constructor() {
        // `??` (not `||`) so that a pre-populated falsy value such as
        // `split_text: false` or `seed: 0` is never replaced by its default.
        const current = this.settings;
        this.settings = {
            provider_endpoint: current.provider_endpoint ?? 'http://localhost:8004',
            voice_mode: current.voice_mode ?? 'predefined',
            predefined_voice: current.predefined_voice ?? 'S1',
            reference_voice: current.reference_voice ?? '',
            temperature: current.temperature ?? 0.8,
            exaggeration: current.exaggeration ?? 0.5,
            cfg_weight: current.cfg_weight ?? 0.5,
            seed: current.seed ?? -1,
            speed_factor: current.speed_factor ?? 1.0,
            language: current.language ?? 'en',
            split_text: current.split_text ?? true,
            chunk_size: current.chunk_size ?? 120,
            output_format: current.output_format ?? 'wav',
            voiceMap: current.voiceMap ?? {},
        };
    }

    /** True once `checkReady()` has successfully reached the server. */
    ready = false;

    /** Voice objects collected by `fetchTtsVoiceObjects()`. */
    voices = [];

    separator = '. ';

    audioElement = document.createElement('audio');

    /** Display label -> language code sent to the server. */
    languageLabels = {
        'English': 'en',
        'Spanish': 'es',
        'French': 'fr',
        'German': 'de',
        'Italian': 'it',
        'Portuguese': 'pt',
        'Polish': 'pl',
        'Turkish': 'tr',
        'Russian': 'ru',
        'Dutch': 'nl',
        'Czech': 'cs',
        'Arabic': 'ar',
        'Chinese': 'zh-cn',
        'Japanese': 'ja',
        'Korean': 'ko',
        'Hindi': 'hi',
    };

    /**
     * Escape a value for safe interpolation into HTML text or a
     * double-quoted attribute.
     * @param {*} value Value to escape; coerced to a string first.
     * @returns {string} The escaped string.
     */
    #escapeHtml(value) {
        const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', '\'': '&#39;' };
        return String(value).replace(/[&<>"']/g, (char) => entities[char]);
    }

    /**
     * Build an absolute URL on the configured server.
     * Trailing slashes on the endpoint are dropped so that
     * `http://host:8004/` does not produce `http://host:8004//tts`.
     * @param {string} path Server path starting with `/`.
     * @returns {string} Endpoint and path joined.
     */
    #endpointUrl(path) {
        const base = String(this.settings.provider_endpoint ?? '').trim().replace(/\/+$/, '');
        return `${base}${path}`;
    }

    /**
     * Markup (plus scoped CSS) for the provider's settings panel, rendered
     * from the current `settings`. Every interpolated setting is HTML-escaped
     * because the values may come from stored user settings.
     * @returns {string} HTML string.
     */
    get settingsHtml() {
        const esc = (value) => this.#escapeHtml(value);
        const selectedIf = (condition) => (condition ? 'selected' : '');
        const {
            provider_endpoint: endpoint,
            language,
            temperature,
            exaggeration,
            cfg_weight: cfgWeight,
            speed_factor: speedFactor,
            seed,
            split_text: splitText,
            chunk_size: chunkSize,
            output_format: outputFormat,
        } = this.settings;

        const languageOptions = Object.entries(this.languageLabels)
            .map(([label, code]) => `<option value="${esc(code)}" ${selectedIf(code === language)}>${esc(label)}</option>`)
            .join('');

        let html = `<div class="chatterbox-settings-container">
            <div class="chatterbox-settings-header">
                <h3>Chatterbox TTS Settings</h3>
                <div class="status-indicator">
                    Status: <span id="chatterbox-status" class="offline">Offline</span>
                </div>
            </div>`;

        // Server endpoint
        html += `<div class="chatterbox-setting-row">
                <label for="chatterbox-endpoint">Server Endpoint:</label>
                <input id="chatterbox-endpoint" type="text" class="text_pole" value="${esc(endpoint)}" />
            </div>`;

        // Language selection
        html += `<div class="chatterbox-setting-row">
                <label for="chatterbox-language">Language:</label>
                <select id="chatterbox-language">${languageOptions}</select>
            </div>`;

        // Generation parameters
        html += `<div class="chatterbox-params-section">
                <h4>Generation Parameters</h4>`;

        // Temperature
        html += `<div class="chatterbox-setting-row">
                <label for="chatterbox-temperature">Temperature: <span id="chatterbox-temperature-value">${esc(temperature)}</span></label>
                <input id="chatterbox-temperature" type="range" min="0" max="1" step="0.1" value="${esc(temperature)}" />
            </div>`;

        // Exaggeration
        html += `<div class="chatterbox-setting-row">
                <label for="chatterbox-exaggeration">Exaggeration: <span id="chatterbox-exaggeration-value">${esc(exaggeration)}</span></label>
                <input id="chatterbox-exaggeration" type="range" min="0" max="2" step="0.1" value="${esc(exaggeration)}" />
            </div>`;

        // CFG Weight
        html += `<div class="chatterbox-setting-row">
                <label for="chatterbox-cfg-weight">CFG Weight: <span id="chatterbox-cfg-weight-value">${esc(cfgWeight)}</span></label>
                <input id="chatterbox-cfg-weight" type="range" min="0" max="1" step="0.1" value="${esc(cfgWeight)}" />
            </div>`;

        // Speed Factor
        html += `<div class="chatterbox-setting-row">
                <label for="chatterbox-speed">Speed Factor: <span id="chatterbox-speed-value">${esc(speedFactor)}</span></label>
                <input id="chatterbox-speed" type="range" min="0.5" max="2" step="0.1" value="${esc(speedFactor)}" />
            </div>`;

        // Seed
        html += `<div class="chatterbox-setting-row">
                <label for="chatterbox-seed">Seed (-1 for random):</label>
                <input id="chatterbox-seed" class="text_pole" type="number" min="-1" value="${esc(seed)}" />
            </div>`;

        // Text chunking
        html += `<div class="chatterbox-setting-row">
                <label class="checkbox_label">
                    <input type="checkbox" id="chatterbox-split-text" ${splitText ? 'checked' : ''} />
                    Split long texts into chunks
                </label>
            </div>`;

        // Chunk size (hidden while splitting is disabled)
        html += `<div class="chatterbox-setting-row" id="chunk-size-row" ${splitText ? '' : 'style="display: none;"'}>
                <label for="chatterbox-chunk-size">Chunk Size:</label>
                <input id="chatterbox-chunk-size" class="text_pole" type="number" min="50" max="500" value="${esc(chunkSize)}" />
            </div>`;

        // Output format
        html += `<div class="chatterbox-setting-row">
                <label for="chatterbox-format">Output Format:</label>
                <select id="chatterbox-format">
                    <option value="wav" ${selectedIf(outputFormat === 'wav')}>WAV</option>
                    <option value="opus" ${selectedIf(outputFormat === 'opus')}>Opus</option>
                </select>
            </div>`;

        html += '</div>'; // End params section

        // Footer with links
        html += `<div class="chatterbox-footer">
                <a href="${esc(endpoint)}" target="_blank">Chatterbox Web UI</a> |
                <a href="https://github.com/devnen/Chatterbox-TTS-Server" target="_blank">Documentation</a>
            </div>`;

        html += '</div>'; // End container

        // Add CSS styles
        html += `<style>
            .chatterbox-settings-container {
                padding: 10px;
            }
            .chatterbox-settings-header {
                display: flex;
                justify-content: space-between;
                align-items: center;
                margin-bottom: 15px;
            }
            .chatterbox-settings-header h3 {
                margin: 0;
            }
            .chatterbox-settings-container .status-indicator {
                font-weight: bold;
            }
            #chatterbox-status.ready { color: #4CAF50; }
            #chatterbox-status.offline { color: #f44336; }
            #chatterbox-status.processing { color: #2196F3; }
            .chatterbox-setting-row {
                margin-bottom: 10px;
                display: flex;
                align-items: center;
                gap: 10px;
            }
            .chatterbox-setting-row label {
                flex: 0 0 150px;
            }
            .chatterbox-setting-row label.checkbox_label {
                flex-basis: auto;
            }
            .chatterbox-setting-row input[type="text"],
            .chatterbox-setting-row input[type="number"],
            .chatterbox-setting-row select {
                flex: 1;
            }
            .chatterbox-setting-row input[type="range"] {
                flex: 1;
            }
            .chatterbox-params-section {
                margin-top: 15px;
                padding-top: 15px;
                border-top: 1px solid #ccc;
            }
            .chatterbox-params-section h4 {
                margin-top: 0;
                margin-bottom: 10px;
            }
            .chatterbox-footer {
                margin-top: 15px;
                padding-top: 15px;
                border-top: 1px solid #ccc;
                text-align: center;
                font-size: 0.9em;
            }
        </style>`;

        return html;
    }

    //######################//
    // Startup & Initialize //
    //######################//

    /**
     * Merge stored settings into the defaults, refresh the panel, attach the
     * panel's event handlers, then probe the server and load its voices.
     *
     * Only keys that already exist as own properties of `settings` are
     * accepted, so unknown or inherited keys in the input are ignored.
     * @param {object} [settings] Previously saved settings; may be empty or nullish.
     * @returns {Promise<void>}
     */
    async loadSettings(settings) {
        this.updateStatus('Offline');

        const incoming = settings ?? {};
        if (Object.keys(incoming).length === 0) {
            console.info('Using default Chatterbox TTS Provider settings');
        } else {
            for (const [key, value] of Object.entries(incoming)) {
                if (Object.hasOwn(this.settings, key)) {
                    this.settings[key] = value;
                }
            }
        }

        this.updateUIFromSettings();
        // Bound before the network probe so the panel is usable right away,
        // even while the server is slow to answer or offline.
        this.setupEventListeners();

        console.debug('ChatterboxTTS: Settings loaded');

        try {
            await this.checkReady();

            if (this.ready) {
                // Fetch all voice types for the voice map
                await this.fetchTtsVoiceObjects();
                this.updateStatus('Ready');
            }
        } catch (error) {
            console.error('Error loading Chatterbox settings:', error);
            this.updateStatus('Offline');
        }
    }

    /** Push every value in `settings` into its control in the settings panel. */
    updateUIFromSettings() {
        const current = this.settings;

        $('#chatterbox-endpoint').val(current.provider_endpoint);
        $('#chatterbox-language').val(current.language);
        $('#chatterbox-temperature').val(current.temperature);
        $('#chatterbox-temperature-value').text(current.temperature);
        $('#chatterbox-exaggeration').val(current.exaggeration);
        $('#chatterbox-exaggeration-value').text(current.exaggeration);
        $('#chatterbox-cfg-weight').val(current.cfg_weight);
        $('#chatterbox-cfg-weight-value').text(current.cfg_weight);
        $('#chatterbox-speed').val(current.speed_factor);
        $('#chatterbox-speed-value').text(current.speed_factor);
        $('#chatterbox-seed').val(current.seed);
        $('#chatterbox-split-text').prop('checked', current.split_text);
        $('#chatterbox-chunk-size').val(current.chunk_size);
        $('#chatterbox-format').val(current.output_format);

        // Chunk size is only relevant while text splitting is enabled
        $('#chunk-size-row').toggle(Boolean(current.split_text));
    }

    //##############################//
    // Check Server is Available    //
    //##############################//

    /**
     * Probe `/api/ui/initial-data` and record the outcome in `this.ready`.
     * Never rejects: any network, HTTP or parse failure is logged and
     * leaves `ready` set to false.
     * @returns {Promise<void>}
     */
    async checkReady() {
        try {
            const response = await fetch(this.#endpointUrl('/api/ui/initial-data'));

            if (!response.ok) {
                throw new Error(`HTTP Error Response: ${response.status} ${response.statusText}`);
            }

            const data = await response.json();
            this.ready = Boolean(data);
            console.log(this.ready
                ? 'Chatterbox TTS service is ready.'
                : 'Chatterbox TTS service returned invalid data.');
        } catch (error) {
            console.error('Error checking Chatterbox TTS service readiness:', error);
            this.ready = false;
        }
    }

    //######################//
    // Get Available Voices //
    //######################//

    /**
     * Load the server's predefined voices.
     * @returns {Promise<object[]>} Voice objects (`name`, `voice_id`, `preview_url`, `lang`).
     * @throws {Error} On a non-OK HTTP status or a non-array payload.
     */
    async #fetchPredefinedVoices() {
        const response = await fetch(this.#endpointUrl('/get_predefined_voices'));
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }

        const payload = await response.json();
        if (!Array.isArray(payload)) {
            throw new TypeError('Predefined voices response is not an array');
        }

        return payload.map((voice) => ({
            name: voice.display_name,
            voice_id: voice.voice_id || voice.filename,
            preview_url: null,
            lang: voice.language || 'en',
        }));
    }

    /**
     * Load the server's reference audio files as clone voices.
     * Best effort: every failure is logged and yields an empty list, so this
     * never rejects.
     * @returns {Promise<object[]>} Voice objects whose `voice_id` carries the `ref_` prefix.
     */
    async #fetchReferenceVoices() {
        try {
            const response = await fetch(this.#endpointUrl('/get_reference_files'));
            if (!response.ok) {
                return [];
            }

            const filenames = await response.json();
            if (!Array.isArray(filenames)) {
                throw new TypeError('Reference files response is not an array');
            }

            return filenames.map((filename) => ({
                name: `[Clone] ${filename}`,
                voice_id: `${ChatterboxTtsProvider.#REFERENCE_PREFIX}${filename}`,
                preview_url: null,
                lang: 'en',
            }));
        } catch (error) {
            console.warn('Failed to fetch reference voices:', error);
            return [];
        }
    }

    /**
     * Refresh `this.voices` with the predefined and reference voices.
     * The two lists are independent, so they are requested in parallel.
     * Never rejects: when the predefined list cannot be loaded the error is
     * logged and `voices` is reset to an empty array.
     * @returns {Promise<object[]>} The combined voice list.
     */
    async fetchTtsVoiceObjects() {
        try {
            const [predefinedVoices, referenceVoices] = await Promise.all([
                this.#fetchPredefinedVoices(),
                this.#fetchReferenceVoices(),
            ]);

            this.voices = [...predefinedVoices, ...referenceVoices];

            console.log(`Loaded ${this.voices.length} voices (${predefinedVoices.length} predefined, ${referenceVoices.length} reference)`);
            return this.voices;
        } catch (error) {
            console.error('Error fetching Chatterbox voices:', error);
            this.voices = [];
            return [];
        }
    }

    // Alias for internal use
    async fetchVoices() {
        return this.fetchTtsVoiceObjects();
    }

    //###########################//
    // Setup Event Listeners     //
    //###########################//

    /**
     * Wire the settings panel controls to `settings`.
     *
     * Handlers are registered under the `.chatterbox` jQuery namespace and
     * any previous registration is removed first, so calling this method
     * repeatedly never stacks duplicate handlers. Every handler finishes by
     * calling `onSettingsChange()`.
     */
    setupEventListeners() {
        const namespace = '.chatterbox';

        const bind = (selector, eventName, applyChange) => {
            const namespacedEvent = `${eventName}${namespace}`;
            $(selector).off(namespacedEvent).on(namespacedEvent, (event) => {
                applyChange(event);
                this.onSettingsChange();
            });
        };

        // Range sliders mirror their value into the "<id>-value" label.
        const bindSlider = (selector, key) => bind(selector, 'input', (event) => {
            const parsed = Number.parseFloat(event.target.value);
            if (!Number.isNaN(parsed)) {
                this.settings[key] = parsed;
            }
            $(`${selector}-value`).text(this.settings[key]);
        });

        // Server endpoint change
        bind('#chatterbox-endpoint', 'input', (event) => {
            this.settings.provider_endpoint = String(event.target.value ?? '').trim();
        });

        // Language
        bind('#chatterbox-language', 'change', (event) => {
            this.settings.language = event.target.value;
        });

        // Parameter sliders
        bindSlider('#chatterbox-temperature', 'temperature');
        bindSlider('#chatterbox-exaggeration', 'exaggeration');
        bindSlider('#chatterbox-cfg-weight', 'cfg_weight');
        bindSlider('#chatterbox-speed', 'speed_factor');

        // Seed: an empty or non-numeric field means "random" (-1), never NaN
        bind('#chatterbox-seed', 'change', (event) => {
            const parsed = Number.parseInt(event.target.value, 10);
            this.settings.seed = Number.isNaN(parsed) ? -1 : parsed;
        });

        // Text splitting
        bind('#chatterbox-split-text', 'change', (event) => {
            const enabled = Boolean(event.target.checked);
            this.settings.split_text = enabled;
            $('#chunk-size-row').toggle(enabled);
        });

        // Chunk size: keep the previous value when the field is not a number
        bind('#chatterbox-chunk-size', 'change', (event) => {
            const parsed = Number.parseInt(event.target.value, 10);
            if (!Number.isNaN(parsed)) {
                this.settings.chunk_size = parsed;
            }
        });

        // Output format
        bind('#chatterbox-format', 'change', (event) => {
            this.settings.output_format = event.target.value;
        });
    }

    //#############################//
    // Store ST interface settings //
    //#############################//

    /** Persist the settings through `saveTtsProviderSettings()`. */
    onSettingsChange() {
        saveTtsProviderSettings();
    }

    //#########################//
    // Handle Reload button    //
    //#########################//

    /**
     * Re-probe the server and, when it is reachable, reload the voice list.
     * The status indicator shows Processing while running and ends on
     * Ready or Offline.
     * @returns {Promise<void>}
     */
    async onRefreshClick() {
        try {
            this.updateStatus('Processing');
            await this.checkReady();

            if (!this.ready) {
                this.updateStatus('Offline');
                return;
            }

            await this.fetchTtsVoiceObjects();
            this.updateStatus('Ready');
        } catch (error) {
            console.error('Error during refresh:', error);
            this.updateStatus('Offline');
        }
    }

    //##################//
    // TTS request core //
    //##################//

    /**
     * Build the JSON payload for the `/tts` endpoint from the current settings.
     * @param {string} text Text to synthesise.
     * @param {string} [voiceId] Voice id; a `ref_` prefix selects clone mode.
     *     When empty, the configured `predefined_voice` is used.
     * @param {{forPreview?: boolean}} [options] `forPreview` disables text
     *     splitting and omits `chunk_size`.
     * @returns {object} Request body.
     */
    #buildTtsRequest(text, voiceId, { forPreview = false } = {}) {
        const prefix = ChatterboxTtsProvider.#REFERENCE_PREFIX;
        const isReferenceVoice = typeof voiceId === 'string' && voiceId.startsWith(prefix);
        const { seed } = this.settings;

        const body = {
            text,
            voice_mode: isReferenceVoice ? 'clone' : 'predefined',
            temperature: this.settings.temperature,
            exaggeration: this.settings.exaggeration,
            cfg_weight: this.settings.cfg_weight,
            // A negative (or NaN) seed means "random": pick one client-side.
            seed: seed >= 0 ? seed : Math.floor(Math.random() * 2147483648),
            speed_factor: this.settings.speed_factor,
            language: this.settings.language,
            split_text: forPreview ? false : this.settings.split_text,
        };

        if (!forPreview) {
            body.chunk_size = this.settings.chunk_size;
        }
        body.output_format = this.settings.output_format;

        if (isReferenceVoice) {
            body.reference_audio_filename = voiceId.slice(prefix.length);
        } else {
            body.predefined_voice_id = voiceId || this.settings.predefined_voice;
        }

        return body;
    }

    /**
     * POST a request body to the server's `/tts` endpoint.
     * @param {object} body Payload from `#buildTtsRequest`.
     * @param {object} [extraHeaders] Headers added to `Content-Type`.
     * @returns {Promise<Response>} The raw fetch response.
     */
    #postTts(body, extraHeaders = {}) {
        return fetch(this.#endpointUrl('/tts'), {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                ...extraHeaders,
            },
            body: JSON.stringify(body),
        });
    }

    //##################//
    // Preview Voice    //
    //##################//

    /**
     * Synthesise a short fixed sentence with the given voice and play it.
     * The status indicator stays on Processing until playback ends or fails.
     * @param {string} voiceId Voice id (with `ref_` prefix for clone voices).
     * @returns {Promise<void>} Resolves once playback has started.
     * @throws {Error} Re-throws any request or playback failure after logging it.
     */
    async previewTtsVoice(voiceId) {
        let audioUrl = null;

        try {
            this.updateStatus('Processing');

            const previewText = 'Hello! This is a preview of the selected voice.';
            const requestBody = this.#buildTtsRequest(previewText, voiceId, { forPreview: true });

            const response = await this.#postTts(requestBody);
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
            }

            // Get the audio blob and play it
            const audioBlob = await response.blob();
            audioUrl = URL.createObjectURL(audioBlob);
            const playbackUrl = audioUrl;

            const audio = new Audio(playbackUrl);
            const release = () => {
                URL.revokeObjectURL(playbackUrl);
                this.updateStatus('Ready');
            };
            audio.addEventListener('ended', release, { once: true });
            // Without this the object URL would leak if decoding/playback fails.
            audio.addEventListener('error', release, { once: true });

            await audio.play();
        } catch (error) {
            if (audioUrl !== null) {
                URL.revokeObjectURL(audioUrl);
            }
            console.error('Error previewing voice:', error);
            this.updateStatus('Ready');
            throw error;
        }
    }

    //#####################//
    // Get Voice Object    //
    //#####################//

    /**
     * Resolve a voice name or id to a voice object, loading the voice list
     * first when it is empty. Unknown names never fail: a `ref_`-prefixed
     * name yields a synthetic clone voice, anything else a default voice
     * object based on `settings.predefined_voice`.
     * @param {string} voiceName Voice display name or voice id.
     * @returns {Promise<object>} Voice object (`name`, `voice_id`, `preview_url`, `lang`).
     */
    async getVoice(voiceName) {
        // Ensure voices are loaded
        if (this.voices.length === 0) {
            await this.fetchTtsVoiceObjects();
        }

        const match = this.voices.find((voice) =>
            voice.name === voiceName ||
            voice.voice_id === voiceName ||
            voice.display_name === voiceName,
        );
        if (match) {
            return match;
        }

        console.warn(`Voice not found: ${voiceName}`);

        // A reference voice that was not in the fetched list
        const prefix = ChatterboxTtsProvider.#REFERENCE_PREFIX;
        if (typeof voiceName === 'string' && voiceName.startsWith(prefix)) {
            return {
                name: `[Clone] ${voiceName.slice(prefix.length)}`,
                voice_id: voiceName,
                preview_url: null,
                lang: 'en',
            };
        }

        // Fall back to a default voice object
        return {
            name: voiceName || 'Default',
            voice_id: voiceName || this.settings.predefined_voice || 'S1',
            preview_url: null,
            lang: 'en',
        };
    }

    //##################//
    // Generate TTS     //
    //##################//

    /**
     * Request speech for `inputText` from the server.
     * @param {string} inputText Text to synthesise.
     * @param {string} voiceId Voice id (with `ref_` prefix for clone voices).
     * @returns {Promise<Response>} The successful fetch response, returned
     *     as-is because the caller expects a Response object.
     * @throws {Error} On network failure or a non-OK status (message carries
     *     the status code and the server's error text).
     */
    async generateTts(inputText, voiceId) {
        try {
            this.updateStatus('Processing');

            const requestBody = this.#buildTtsRequest(inputText, voiceId);
            console.log('Generating TTS with params:', requestBody);

            const response = await this.#postTts(requestBody, { 'Cache-Control': 'no-cache' });

            if (!response.ok) {
                const errorText = await response.text();
                console.error('TTS generation error:', errorText);
                throw new Error(`HTTP ${response.status}: ${errorText}`);
            }

            return response;
        } catch (error) {
            console.error('Error in generateTts:', error);
            throw error;
        } finally {
            // The indicator returns to Ready after success and failure alike.
            this.updateStatus('Ready');
        }
    }

    //######################//
    // Update Status        //
    //######################//

    /**
     * Show `status` in the panel's status indicator and set the element's
     * class to the lower-cased status (used by the CSS colour rules).
     * Does nothing when the panel is not in the DOM.
     * @param {string} status Status label, e.g. 'Ready', 'Offline', 'Processing'.
     */
    updateStatus(status) {
        const statusElement = document.getElementById('chatterbox-status');
        if (!statusElement) {
            return;
        }

        const label = String(status);
        statusElement.textContent = label;
        statusElement.className = label.toLowerCase();
    }
}
|