From 1a255f1182275f950a22cc98bb3f6e2c12d8f183 Mon Sep 17 00:00:00 2001
From: klappstuhl24 <t.kleipsties@web.de>
Date: Thu, 4 Apr 2024 11:10:57 +0200
Subject: [PATCH] text2speech

---
 3d Prototyp/Assets/Scripts/Text2Speech.cs     | 252 ++++++++++++++++++
 .../Assets/Scripts/Text2Speech.cs.meta        |  11 +
 2 files changed, 263 insertions(+)
 create mode 100644 3d Prototyp/Assets/Scripts/Text2Speech.cs
 create mode 100644 3d Prototyp/Assets/Scripts/Text2Speech.cs.meta
diff --git a/3d Prototyp/Assets/Scripts/Text2Speech.cs b/3d Prototyp/Assets/Scripts/Text2Speech.cs
new file mode 100644
index 00000000..4ad4e1b4
--- /dev/null
+++ b/3d Prototyp/Assets/Scripts/Text2Speech.cs	
@@ -0,0 +1,252 @@
+using UnityEngine;
+using UnityEngine.Networking;
+using System;
+using System.IO;
+using System.Text;
+using Newtonsoft.Json;
+using System.Collections;
+using OpenAI_API;
+using OpenAI_API.Chat;
+using OpenAI_API.Models;
+using System.Threading.Tasks;
+using Newtonsoft.Json.Linq;
+using System.Collections.Generic;
+
+/// <summary>JSON response body of the Google Cloud text:synthesize request, as deserialized in SynthesizeSpeech.</summary>
+[Serializable]
+public class TextToSpeechResponse
+{
+    public string audioContent; // Base64-encoded audio data; decoded in Text2Speech.SetAudioClip
+}
+
+/// <summary>Generates a short NPC line with the OpenAI chat API, synthesizes it with Google Cloud Text-to-Speech
+/// and plays it. Set <see cref="generate"/> or <see cref="playSound"/> to true to trigger the respective action.</summary>
+public class Text2Speech : MonoBehaviour
+{
+    public string context = "going for a walk, falling, explosion, blood";
+    public string gender = "MALE";
+    public JToken voice = null;
+    public double speakingSpeed = 1.1;
+    public bool playSound = false;
+    public bool generate = false;
+
+    // API keys are read from these environment variables; secrets must never be committed to source control.
+    private const string GoogleCloudApiKeyVariable = "GOOGLE_CLOUD_API_KEY";
+    private const string OpenAiApiKeyVariable = "OPENAI_API_KEY";
+    private const string GoogleCloudApiUrl = "https://texttospeech.googleapis.com/v1/text:synthesize";
+    private const string GoogleCloudVoicesUrl = "https://texttospeech.googleapis.com/v1beta1/voices";
+    private const string Prompt = "Write a short text for an NPC in a game.The text should be based on the following bullet-point context, which describes the events of the last moments. Remember to only respond with the short text that the ONE NPC should speak! The context is: ";
+    // Shared instance: System.Random objects created in quick succession may be seeded identically.
+    private static readonly System.Random _sharedRandom = new System.Random();
+
+    private AudioSource _audioSource;
+    private OpenAIAPI _openAiApi;
+    private Conversation _conversation;
+    private string _googleCloudApiKey;
+    private string _openAiApiKey;
+    private string _outputPath;
+    private string _tmpPath = "tmp_audio.wav";
+
+    /// <summary>Caches the audio source and resolves the API keys and the audio file locations.</summary>
+    void Start()
+    {
+        _tmpPath = "tmp_audio_" + GetInstanceID().ToString() + ".wav";
+        _audioSource = GetComponent<AudioSource>();
+        _googleCloudApiKey = Environment.GetEnvironmentVariable(GoogleCloudApiKeyVariable);
+        _openAiApiKey = Environment.GetEnvironmentVariable(OpenAiApiKeyVariable);
+        // Resolved here because Unity path properties cannot be read from field initializers.
+        _outputPath = Path.Combine(Application.persistentDataPath, "audio.wav");
+    }
+
+    /// <summary>Polls the <see cref="playSound"/> and <see cref="generate"/> trigger flags once per frame.</summary>
+    public void Update()
+    {
+        if (playSound)
+        {
+            playSound = false;
+            _audioSource.Play();
+        }
+        if (!generate)
+        {
+            return;
+        }
+        generate = false;
+        if (string.IsNullOrEmpty(_googleCloudApiKey) || string.IsNullOrEmpty(_openAiApiKey))
+        {
+            Debug.LogError($"Text2Speech: Set the {GoogleCloudApiKeyVariable} and {OpenAiApiKeyVariable} environment variables before generating speech.");
+            return;
+        }
+        if (voice != null)
+        {
+            StartCoroutine(GenerateAndSynthesizeText(context));
+            return;
+        }
+        GetRandomGermanVoice(gender, (v) =>
+        {
+            // A failed lookup reports null; synthesizing without a voice would throw.
+            if (v != null)
+            {
+                voice = v;
+                StartCoroutine(GenerateAndSynthesizeText(context));
+            }
+        });
+    }
+
+    /// <summary>Starts an asynchronous lookup of a random voice. Despite the name, the filter selects English ("en-") voices.</summary>
+    /// <param name="gender">SSML gender the voice must match exactly, e.g. "MALE".</param>
+    /// <param name="callback">Invoked once with the chosen voice (a "pitch" value is added to it), or with null on failure.</param>
+    public void GetRandomGermanVoice(string gender, Action<JToken> callback)
+    {
+        StartCoroutine(GetRandomGermanVoiceCoroutine(gender, callback));
+    }
+
+    /// <summary>Downloads the voice list, filters it and passes one random match (or null) to the callback.</summary>
+    private IEnumerator GetRandomGermanVoiceCoroutine(string gender, Action<JToken> callback)
+    {
+        using (UnityWebRequest webRequest = UnityWebRequest.Get(GoogleCloudVoicesUrl + "?key=" + _googleCloudApiKey))
+        {
+            yield return webRequest.SendWebRequest();
+            // Same check as the other requests in this class; isNetworkError/isHttpError are obsolete.
+            if (webRequest.result != UnityWebRequest.Result.Success)
+            {
+                Debug.LogError(webRequest.error);
+                callback(null);
+                yield break;
+            }
+
+            // Keep English ("en-"), non-"Standard" voices of the requested gender; malformed entries are skipped.
+            JArray voices = (JObject.Parse(webRequest.downloadHandler.text)["voices"] as JArray) ?? new JArray();
+            List<JToken> filteredVoices = new List<JToken>();
+            foreach (JToken v in voices)
+            {
+                string languageCode = (v["languageCodes"] as JArray)?.First?.ToString();
+                string name = v["name"]?.ToString();
+                if (languageCode != null && name != null && languageCode.StartsWith("en-", StringComparison.Ordinal)
+                    && v["ssmlGender"]?.ToString() == gender && !name.Contains("Standard"))
+                {
+                    filteredVoices.Add(v);
+                }
+            }
+            if (filteredVoices.Count == 0)
+            {
+                Debug.LogError("GoogleCloud: No matching voice found.");
+                callback(null);
+                yield break;
+            }
+
+            JToken randomVoice = filteredVoices[UnityEngine.Random.Range(0, filteredVoices.Count)];
+            randomVoice["pitch"] = SampleRandomPitch();
+            callback(randomVoice);
+        }
+    }
+
+    /// <summary>Samples a normally distributed pitch via the Box-Muller transform, clamped to [-20, 20] and rounded.</summary>
+    private static int SampleRandomPitch()
+    {
+        const double mean = 0, stdDev = 4;
+        // 1 - NextDouble() lies in (0, 1], so Math.Log never receives zero.
+        double u1 = 1.0 - _sharedRandom.NextDouble();
+        double u2 = 1.0 - _sharedRandom.NextDouble();
+        double randStdNormal = Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Sin(2.0 * Math.PI * u2);
+        return (int)Math.Round(Math.Max(Math.Min(mean + stdDev * randStdNormal, 20), -20));
+    }
+
+    /// <summary>Waits for the generated text and, on success, starts speech synthesis for it.</summary>
+    IEnumerator GenerateAndSynthesizeText(string context)
+    {
+        var generateTextTask = GenerateText(context);
+        yield return new WaitUntil(() => generateTextTask.IsCompleted);
+
+        // Result may only be read from a successfully completed task; otherwise it throws.
+        if (generateTextTask.IsFaulted || generateTextTask.IsCanceled)
+        {
+            Debug.LogError(generateTextTask.Exception?.ToString() ?? "ChatGPT: Text generation was canceled.");
+            yield break;
+        }
+        StartCoroutine(SynthesizeSpeech(generateTextTask.Result));
+    }
+
+    /// <summary>Asks the chat model for one NPC line based on <paramref name="context"/>.</summary>
+    async Task<string> GenerateText(string context)
+    {
+        _openAiApi = new OpenAIAPI(_openAiApiKey);
+        ChatRequest chatRequest = new ChatRequest { Temperature = 0.9, Model = Model.ChatGPTTurbo };
+        // Each request starts a new conversation that contains only this prompt.
+        _conversation = _openAiApi.Chat.CreateConversation(chatRequest);
+        _conversation.AppendUserInput(Prompt + context);
+        return await _conversation.GetResponseFromChatbotAsync();
+    }
+
+    /// <summary>Synthesizes the text with the selected <see cref="voice"/> via Google Cloud and passes the audio to SetAudioClip.</summary>
+    IEnumerator SynthesizeSpeech(string textToSynthesize)
+    {
+        string languageCode = (voice?["languageCodes"] as JArray)?.First?.ToString();
+        if (languageCode == null || string.IsNullOrWhiteSpace(textToSynthesize))
+        {
+            Debug.LogError("GoogleCloud: Cannot synthesize speech without a voice and a text.");
+            yield break;
+        }
+        var requestObject = new
+        {
+            input = new { text = textToSynthesize },
+            voice = new { languageCode, name = voice["name"], ssmlGender = voice["ssmlGender"] },
+            audioConfig = new { audioEncoding = "LINEAR16", speakingRate = speakingSpeed, pitch = voice["pitch"] }
+        };
+        byte[] requestBody = Encoding.UTF8.GetBytes(JsonConvert.SerializeObject(requestObject));
+
+        using (UnityWebRequest www = new UnityWebRequest(GoogleCloudApiUrl + "?key=" + _googleCloudApiKey, "POST"))
+        {
+            www.uploadHandler = new UploadHandlerRaw(requestBody);
+            www.downloadHandler = new DownloadHandlerBuffer();
+            www.SetRequestHeader("Content-Type", "application/json");
+            yield return www.SendWebRequest();
+            if (www.result != UnityWebRequest.Result.Success)
+            {
+                Debug.LogError("GoogleCloud: Error: " + www.error);
+                Debug.LogError("GoogleCloud: Response: " + www.downloadHandler.text);
+                yield break;
+            }
+            TextToSpeechResponse response = JsonConvert.DeserializeObject<TextToSpeechResponse>(www.downloadHandler.text);
+            SetAudioClip(response?.audioContent);
+        }
+    }
+
+    /// <summary>Decodes the Base64 WAV data once, writes it to a temporary file for loading and saves a copy.</summary>
+    private void SetAudioClip(string base64AudioContent)
+    {
+        if (string.IsNullOrEmpty(base64AudioContent))
+        {
+            Debug.LogError("GoogleCloud: Response contained no audio content.");
+            return;
+        }
+        byte[] audioBytes = Convert.FromBase64String(base64AudioContent);
+        string tempFilePath = Path.Combine(Application.temporaryCachePath, _tmpPath);
+        File.WriteAllBytes(tempFilePath, audioBytes);
+        StartCoroutine(LoadWavAudio(tempFilePath));
+        SaveAudioFile(audioBytes);
+    }
+
+    /// <summary>Loads the WAV file at <paramref name="path"/> into the audio source and plays it.</summary>
+    IEnumerator LoadWavAudio(string path)
+    {
+        // Uri builds a well-formed, escaped file URI; "file:///" + path adds a fourth slash for rooted Unix paths.
+        using (UnityWebRequest www = UnityWebRequestMultimedia.GetAudioClip(new Uri(path).AbsoluteUri, AudioType.WAV))
+        {
+            yield return www.SendWebRequest();
+            if (www.result != UnityWebRequest.Result.Success)
+            {
+                Debug.LogError("Fehler beim Laden des AudioClips: " + www.error);
+                yield break;
+            }
+            _audioSource.clip = DownloadHandlerAudioClip.GetContent(www);
+            _audioSource.Play();
+        }
+    }
+
+    /// <summary>Saves a copy of the decoded audio under Application.persistentDataPath instead of a machine-specific path.</summary>
+    private void SaveAudioFile(byte[] audioBytes)
+    {
+        File.WriteAllBytes(_outputPath, audioBytes);
+        Debug.Log($"GoogleCloud: Successfully saved WAV file as {_outputPath}");
+    }
+}
diff --git a/3d Prototyp/Assets/Scripts/Text2Speech.cs.meta b/3d Prototyp/Assets/Scripts/Text2Speech.cs.meta
new file mode 100644
index 00000000..1dbd1e73
--- /dev/null
+++ b/3d Prototyp/Assets/Scripts/Text2Speech.cs.meta	
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: ef5183dac70a54b4cbed3e05d617524f
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: