// GameVsJam/3d Prototyp/Assets/Scripts/Text2Speech.cs
// (Web-viewer export header, kept as comments so the file parses as C#:
//  258 lines, 9.2 KiB, C#; revision timestamp 2024-04-04 11:10:57 +02:00.)
using UnityEngine;
using UnityEngine.Networking;
using System;
using System.IO;
using System.Text;
using Newtonsoft.Json;
using System.Collections;
using OpenAI_API;
using OpenAI_API.Chat;
using OpenAI_API.Models;
using System.Threading.Tasks;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
/// <summary>
/// Deserialization target for the JSON body returned by the speech synthesis request.
/// </summary>
[Serializable]
public class TextToSpeechResponse
{
    /// <summary>
    /// Audio payload as delivered in the response; it is Base64-decoded before being written to disk.
    /// </summary>
    public string audioContent;
}
public class Text2Speech : MonoBehaviour
{
/// <summary>Bullet-point description of recent events; appended to the prompt sent to the chat model.</summary>
public string context = "going for a walk, falling, explosion, blood";

/// <summary>SSML gender handed to the voice lookup and compared against each voice's "ssmlGender".</summary>
public string gender = "MALE";

/// <summary>Voice entry used for synthesis. While null, the next generate request first fetches a random voice.</summary>
public JToken voice = null;

/// <summary>Value sent as <c>audioConfig.speakingRate</c> in the synthesis request.</summary>
public double speakingSpeed = 1.1;

/// <summary>One-shot trigger: when true, <c>Update</c> replays the current clip and resets the flag.</summary>
public bool playSound = false;

/// <summary>One-shot trigger: when true, <c>Update</c> starts text generation and synthesis and resets the flag.</summary>
public bool generate = false;

private AudioSource _audioSource;

// SECURITY: API keys must never be committed to source control. The keys that used to be
// hard-coded here have been exposed and should be revoked and rotated. They are now read from
// the process environment; an unset variable yields an empty key and the requests will be rejected.
private readonly string _googelCloudApiKey = Environment.GetEnvironmentVariable("GOOGLE_CLOUD_API_KEY") ?? string.Empty;
private readonly string _google_CloudApiUrl = "https://texttospeech.googleapis.com/v1/text:synthesize";

// Target of SaveAudioFile; still a placeholder path.
private readonly string _outputPath = "path/to/audio.wav";

// File name (relative to the temporary cache path) of the downloaded audio; replaced in Start
// with a name that contains the instance ID.
private string _tmpPath = "tmp_audio.wav";

private OpenAIAPI _openAiApi;
private Conversation? _conversation;
private readonly string _openAiApiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY") ?? string.Empty;

// Instruction prefix for the chat model; the current context string is appended to it.
private readonly string _prompt = "Write a short text for a Developer as an NPC in a game. The Developer works at a small gamedevelopement office and its manager is called Gottfried who is responsable for all the Developers needs. The text should be based on the following bullet-point context, which describes the events of the last moments. Remember to only respond with the short text that only this ONE Developer should speak and nothing else! The context is: ";
/// <summary>
/// Unity start hook: caches the AudioSource on this GameObject and derives a temp-file name
/// that contains this component's instance ID.
/// </summary>
void Start()
{
    _audioSource = GetComponent<AudioSource>();
    _tmpPath = $"tmp_audio_{GetInstanceID()}.wav";
}
/// <summary>
/// Polls the two one-shot flags every frame. <c>playSound</c> replays the current clip;
/// <c>generate</c> starts text generation and speech synthesis, fetching a voice first
/// when none has been selected yet.
/// </summary>
public void Update()
{
    if (playSound)
    {
        playSound = false;
        _audioSource.Play();
    }

    if (!generate)
    {
        return;
    }
    generate = false;

    if (voice != null)
    {
        StartCoroutine(GenerateAndSynthesizeText(context));
        return;
    }

    GetRandomGermanVoice(gender, v =>
    {
        // The lookup reports failure by passing null. Starting generation anyway would
        // spend a chat request and then fail when the voice is dereferenced.
        if (v == null)
        {
            Debug.LogError("GoogleCloud: No voice available; skipping speech generation.");
            return;
        }

        voice = v;
        StartCoroutine(GenerateAndSynthesizeText(context));
    });
}
/// <summary>
/// Requests a new spoken line: stores the context and raises the flag that
/// <c>Update</c> picks up on its next run.
/// </summary>
/// <param name="c">Context string to append to the prompt.</param>
public void Generate(string c)
{
    generate = true;
    context = c;
}
// (web-export residue: revision timestamp 2024-04-04 11:10:57 +02:00)
/// <summary>
/// Starts the voice-lookup coroutine; the result is delivered through <paramref name="callback"/>.
/// </summary>
/// <param name="gender">SSML gender the voice has to match.</param>
/// <param name="callback">Receives the selected voice, or null when the lookup fails.</param>
public void GetRandomGermanVoice(string gender, Action<JToken> callback) =>
    StartCoroutine(GetRandomGermanVoiceCoroutine(gender, callback));
/// <summary>
/// Downloads the voice list, keeps the non-"Standard" voices whose first language code starts
/// with "en-" and whose SSML gender equals <paramref name="gender"/>, picks one at random,
/// attaches a random "pitch" value and hands it to <paramref name="callback"/>.
/// The callback receives null when the request fails or no voice matches.
/// </summary>
/// <remarks>
/// NOTE(review): the method name says "German" but the filter selects "en-" voices;
/// confirm which language is intended.
/// </remarks>
private IEnumerator GetRandomGermanVoiceCoroutine(string gender, Action<JToken> callback)
{
    string url = $"https://texttospeech.googleapis.com/v1beta1/voices?key={_googelCloudApiKey}";
    using (UnityWebRequest webRequest = UnityWebRequest.Get(url))
    {
        yield return webRequest.SendWebRequest();

        // Same result-based check as the other requests in this file
        // (isNetworkError / isHttpError are obsolete).
        if (webRequest.result != UnityWebRequest.Result.Success)
        {
            Debug.LogError(webRequest.error);
            callback?.Invoke(null);
            yield break;
        }

        List<JToken> candidates = FilterVoices(webRequest.downloadHandler.text, gender);
        if (candidates.Count == 0)
        {
            Debug.LogError("GoogleCloud: No matching voice found.");
            callback?.Invoke(null);
            yield break;
        }

        // Pick a random voice from the filtered results.
        JToken randomVoice = candidates[UnityEngine.Random.Range(0, candidates.Count)];
        randomVoice["pitch"] = SampleRandomPitch();
        callback?.Invoke(randomVoice);
    }
}

/// <summary>
/// Parses the voice-list JSON and returns the entries that pass the language, gender and
/// name filter. Malformed JSON or entries with missing fields are skipped, not thrown on.
/// </summary>
private static List<JToken> FilterVoices(string json, string gender)
{
    List<JToken> matches = new List<JToken>();
    if (string.IsNullOrEmpty(json))
    {
        return matches;
    }

    JArray voices;
    try
    {
        voices = JObject.Parse(json)["voices"] as JArray;
    }
    catch (JsonException e)
    {
        Debug.LogError("GoogleCloud: Could not parse voice list: " + e.Message);
        return matches;
    }

    if (voices == null)
    {
        return matches;
    }

    foreach (JToken v in voices)
    {
        string languageCode = (v["languageCodes"] as JArray)?.First?.ToString();
        string ssmlGender = v["ssmlGender"]?.ToString();
        string name = v["name"]?.ToString();
        if (languageCode == null || name == null)
        {
            continue;
        }

        if (languageCode.StartsWith("en-", StringComparison.Ordinal) && ssmlGender == gender && !name.Contains("Standard"))
        {
            matches.Add(v);
        }
    }

    return matches;
}

/// <summary>
/// Draws a normally distributed value (mean 0, standard deviation 4) with the Box-Muller
/// transform, clamps it to [-20, 20] and rounds it to an integer.
/// </summary>
private static int SampleRandomPitch()
{
    const double mean = 0;
    const double stdDev = 4;

    System.Random random = new System.Random();
    // 1 - NextDouble() lies in (0, 1], which keeps Math.Log away from zero.
    double u1 = 1.0 - random.NextDouble();
    double u2 = 1.0 - random.NextDouble();
    double standardNormal = Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Sin(2.0 * Math.PI * u2);

    double clamped = Math.Max(Math.Min(mean + stdDev * standardNormal, 20), -20);
    return (int)Math.Round(clamped);
}
/// <summary>
/// Coroutine that waits for the chat model to produce a line for <paramref name="context"/>
/// and then starts speech synthesis for it. Failures are logged and end the coroutine.
/// </summary>
/// <param name="context">Context string appended to the prompt.</param>
IEnumerator GenerateAndSynthesizeText(string context)
{
    Task<string> generateTextTask = GenerateText(context);
    yield return new WaitUntil(() => generateTextTask.IsCompleted);

    if (generateTextTask.IsFaulted)
    {
        Debug.LogError(generateTextTask.Exception.ToString());
        yield break;
    }

    // Reading Result on a canceled task throws, so handle cancellation explicitly.
    if (generateTextTask.IsCanceled)
    {
        Debug.LogError("ChatGPT: Text generation was canceled.");
        yield break;
    }

    string chatGptResponse = generateTextTask.Result;
    if (string.IsNullOrWhiteSpace(chatGptResponse))
    {
        Debug.LogError("ChatGPT: Received an empty response; nothing to synthesize.");
        yield break;
    }

    StartCoroutine(SynthesizeSpeech(chatGptResponse));
}
/// <summary>
/// Opens a fresh chat conversation, sends the prompt followed by <paramref name="context"/>
/// and returns the chatbot's reply.
/// </summary>
/// <param name="context">Context string appended to the prompt.</param>
/// <returns>The text returned by the chatbot.</returns>
async Task<string> GenerateText(string context)
{
    _openAiApi = new OpenAIAPI(_openAiApiKey);

    var request = new ChatRequest
    {
        Temperature = 0.9,
        Model = Model.ChatGPTTurbo
    };

    _conversation = _openAiApi.Chat.CreateConversation(request);
    _conversation.AppendUserInput(_prompt + context);

    return await _conversation.GetResponseFromChatbotAsync();
}
/// <summary>
/// Coroutine that posts <paramref name="textToSynthesize"/> together with the selected voice,
/// speaking rate and pitch to the synthesis endpoint and forwards the returned audio
/// to <c>SetAudioClip</c>. Errors are logged and end the coroutine.
/// </summary>
/// <param name="textToSynthesize">Text to turn into speech.</param>
IEnumerator SynthesizeSpeech(string textToSynthesize)
{
    // voice is a public field and may be unset or malformed; check before dereferencing.
    JToken selectedVoice = voice;
    string languageCode = (selectedVoice?["languageCodes"] as JArray)?.First?.ToString();
    if (languageCode == null)
    {
        Debug.LogError("GoogleCloud: No usable voice selected; cannot synthesize speech.");
        yield break;
    }

    var requestObject = new
    {
        input = new { text = textToSynthesize },
        voice = new
        {
            languageCode,
            name = selectedVoice["name"],
            ssmlGender = selectedVoice["ssmlGender"]
        },
        audioConfig = new
        {
            audioEncoding = "LINEAR16",
            speakingRate = speakingSpeed,
            pitch = selectedVoice["pitch"]
        }
    };

    string jsonRequestBody = JsonConvert.SerializeObject(requestObject);
    byte[] requestBody = Encoding.UTF8.GetBytes(jsonRequestBody);

    using (UnityWebRequest www = new UnityWebRequest(_google_CloudApiUrl + "?key=" + _googelCloudApiKey, "POST"))
    {
        www.uploadHandler = new UploadHandlerRaw(requestBody);
        www.downloadHandler = new DownloadHandlerBuffer();
        www.SetRequestHeader("Content-Type", "application/json");

        yield return www.SendWebRequest();

        if (www.result != UnityWebRequest.Result.Success)
        {
            Debug.LogError("GoogleCloud: Error: " + www.error);
            Debug.LogError("GoogleCloud: Response: " + www.downloadHandler.text);
            yield break;
        }

        TextToSpeechResponse response = JsonConvert.DeserializeObject<TextToSpeechResponse>(www.downloadHandler.text);

        // A body without audioContent would otherwise reach Convert.FromBase64String as null.
        if (string.IsNullOrEmpty(response?.audioContent))
        {
            Debug.LogError("GoogleCloud: Response contained no audio content.");
            yield break;
        }

        SetAudioClip(response.audioContent);
    }
}
/// <summary>
/// Decodes the Base64 audio, writes it to this component's temp file below the temporary
/// cache path and starts loading that file as an AudioClip.
/// </summary>
/// <param name="base64AudioContent">Base64-encoded audio data.</param>
private void SetAudioClip(string base64AudioContent)
{
    byte[] decoded = Convert.FromBase64String(base64AudioContent);
    string cacheFile = Path.Combine(Application.temporaryCachePath, _tmpPath);

    File.WriteAllBytes(cacheFile, decoded);
    StartCoroutine(LoadWavAudio(cacheFile));
}
/// <summary>
/// Coroutine that loads the WAV file at <paramref name="path"/> as an AudioClip, assigns it
/// to the AudioSource and plays it. A failed load is logged.
/// </summary>
/// <param name="path">File-system path of the WAV file.</param>
IEnumerator LoadWavAudio(string path)
{
    // Build the file URL with System.Uri instead of string concatenation: "file:///" + path
    // yields four slashes for POSIX paths and leaves spaces and special characters unescaped.
    string fileUrl = new Uri(Path.GetFullPath(path)).AbsoluteUri;

    using (UnityWebRequest www = UnityWebRequestMultimedia.GetAudioClip(fileUrl, AudioType.WAV))
    {
        yield return www.SendWebRequest();

        if (www.result != UnityWebRequest.Result.Success)
        {
            Debug.LogError("Fehler beim Laden des AudioClips: " + www.error);
            yield break;
        }

        _audioSource.clip = DownloadHandlerAudioClip.GetContent(www);
        _audioSource.Play();
    }
}
/// <summary>
/// Decodes the Base64 audio and writes it to <c>_outputPath</c>, creating the target
/// directory first when it does not exist.
/// </summary>
/// <param name="base64AudioContent">Base64-encoded audio data.</param>
private void SaveAudioFile(string base64AudioContent)
{
    byte[] audioBytes = Convert.FromBase64String(base64AudioContent);

    // File.WriteAllBytes does not create missing directories.
    string directory = Path.GetDirectoryName(_outputPath);
    if (!string.IsNullOrEmpty(directory))
    {
        Directory.CreateDirectory(directory);
    }

    File.WriteAllBytes(_outputPath, audioBytes);
    Debug.Log($"GoogleCloud: Successfully saved WAV file as {_outputPath}");
}
}