{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Laden der Rohdaten" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "\n", "# Laden der 'kirp' Liste aus der Pickle-Datei\n", "with open('rick.pickle', 'rb') as f:\n", " data_frame = pickle.load(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Aktiviere Cuda Support" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "device = \"cpu\"\n", "if torch.cuda.is_available():\n", " print(\"CUDA is available on your system.\")\n", " device = \"cuda\"\n", "else:\n", " print(\"CUDA is not available on your system.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# PCA Klasse zu Reduktion der Dimensionen" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import Dataset\n", "import torch\n", "import pandas as pd\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.model_selection import train_test_split\n", "from typing import List, Tuple, Dict\n", "\n", "\n", "class GenomeDataset(Dataset):\n", " \"\"\"\n", " Eine benutzerdefinierte Dataset-Klasse, die für die Handhabung von Genomdaten konzipiert ist.\n", " Diese Klasse wendet eine Principal Component Analysis (PCA) auf die Frequenzen der Genome an\n", " und teilt den Datensatz in Trainings- und Validierungsteile auf.\n", "\n", " Attributes:\n", " dataframe (pd.DataFrame): Ein Pandas DataFrame, der die initialen Daten enthält.\n", " train_df (pd.DataFrame): Ein DataFrame, der den Trainingsdatensatz nach der Anwendung von PCA und der Aufteilung enthält.\n", " val_df (pd.DataFrame): Ein DataFrame, der den Validierungsdatensatz nach der Anwendung von PCA und der Aufteilung enthält.\n", "\n", " Methods:\n", " __init__(self, dataframe, n_pca_components=1034, train_size=0.8, split_random_state=42):\n", " Konstruktor für die GenomeDataset Klasse.\n", " _do_PCA(self, frequencies, n_components=1034):\n", " Wendet PCA auf die gegebenen Frequenzen an.\n", " _split_dataset(self, train_size=0.8, random_state=42):\n", " Teilt den DataFrame in Trainings- und Validierungsdatensätze auf.\n", " __getitem__(self, index):\n", " Gibt ein Tupel aus transformierten Frequenzen und dem zugehörigen Krebstyp für einen gegebenen Index zurück.\n", " __len__(self):\n", " Gibt die Gesamtlänge der kombinierten Trainings- und Validierungsdatensätze zurück.\n", " \"\"\"\n", "\n", " def __init__(self, dataframe: pd.DataFrame, n_pca_components: int = 1034, train_size: float = 0.8, split_random_state: int = 42):\n", " \"\"\"\n", " Konstruktor für die GenomeDataset Klasse.\n", "\n", " Parameters:\n", " dataframe (pd.DataFrame): Der DataFrame, der die Genome Frequenzen und Krebsarten enthält.\n", " n_pca_components (int): Die Anzahl der PCA-Komponenten, auf die reduziert werden soll. Standardwert ist 1034.\n", " train_size (float): Der Anteil der Daten, der als Trainingsdaten verwendet werden soll. Standardwert ist 0.8.\n", " split_random_state (int): Der Zufalls-Saatwert, der für die Aufteilung des Datensatzes verwendet wird. Standardwert ist 42.\n", " \"\"\"\n", " self.dataframe = dataframe\n", "\n", " # Umwandlung der Krebsarten in numerische Werte\n", " self.label_encoder = LabelEncoder()\n", " self.dataframe['encoded_cancer_type'] = self.label_encoder.fit_transform(dataframe['cancer_type'])\n", "\n", " # Anwenden der PCA auf die Frequenzen\n", " self.dataframe['pca_frequencies'] = self._do_PCA(self.dataframe['genome_frequencies'].tolist(), n_pca_components)\n", "\n", " # Teilen des DataFrame in Trainings- und Validierungsdatensatz\n", " self._split_dataset(train_size=train_size, random_state=split_random_state)\n", "\n", " def transform_datapoint(self, datapoint: List[float]) -> List[float]:\n", " \"\"\"\n", " Transformiert einen einzelnen Datenpunkt durch Standardisierung und Anwendung der PCA.\n", "\n", " Diese Methode nimmt einen rohen Datenpunkt (eine Liste von Frequenzen), standardisiert ihn mit dem \n", " zuvor angepassten Scaler und wendet dann die PCA-Transformation an, um ihn in den reduzierten \n", " Feature-Raum zu überführen, der für das Training des Modells verwendet wurde.\n", "\n", " Parameters:\n", " datapoint (List[float]): Ein roher Datenpunkt, bestehend aus einer Liste von Frequenzen.\n", "\n", " Returns:\n", " List[float]: Der transformierte Datenpunkt, nach Anwendung der Standardisierung und der PCA.\n", " \"\"\"\n", " # Standardisierung des Datenpunkts\n", " scaled_data_point = self.scaler.transform([datapoint])\n", "\n", " # PCA-Transformation des standardisierten Datenpunkts\n", " pca_transformed_point = self.pca.transform(scaled_data_point)\n", "\n", " return pca_transformed_point.tolist()\n", "\n", " def _do_PCA(self, frequencies: List[List[float]], n_components: int = 1034) -> List[List[float]]:\n", " \"\"\"\n", " Wendet PCA auf die gegebenen Frequenzen an.\n", "\n", " Parameters:\n", " frequencies (List[List[float]]): Die Liste der Frequenzen, auf die die PCA angewendet werden soll.\n", " n_components (int): Die Anzahl der Komponenten für die PCA. Standardwert ist 1034.\n", "\n", " Returns:\n", " List[List[float]]: Eine Liste von Listen, die die transformierten Frequenzen nach der PCA darstellt.\n", " \"\"\"\n", "\n", " # Standardisieren der Frequenzen\n", " self.scaler = StandardScaler()\n", " scaled_frequencies = self.scaler.fit_transform(frequencies)\n", "\n", " # PCA-Instanz erstellen und auf die gewünschte Anzahl von Komponenten reduzieren\n", " self.pca = PCA(n_components=n_components)\n", "\n", " # PCA auf die Frequenzen anwenden\n", " pca_result = self.pca.fit_transform(scaled_frequencies)\n", "\n", " return pca_result.tolist()\n", "\n", " def _split_dataset(self, train_size: float = 0.8, random_state: int = 42):\n", " \"\"\"\n", " Teilt den DataFrame in Trainings- und Validierungsdatensätze auf.\n", "\n", " Parameters:\n", " train_size (float): Der Anteil der Daten, der als Trainingsdaten verwendet werden soll.\n", " random_state (int): Der Zufalls-Saatwert, der für die Aufteilung des Datensatzes verwendet wird.\n", " \"\"\"\n", "\n", " class SplittedDataset(Dataset):\n", " def __init__(self, dataframe):\n", " self.dataframe = dataframe\n", "\n", " # Umwandlung der Genome Frequenzen in Tensoren\n", " self.genome_frequencies = torch.tensor(dataframe['pca_frequencies'].tolist(), dtype=torch.float32)\n", "\n", " # Umwandlung der Krebsarten in numerische Werte\n", " self.label_encoder = LabelEncoder()\n", " self.cancer_types = torch.tensor(dataframe['encoded_cancer_type'].tolist(), dtype=torch.long)\n", "\n", " def __getitem__(self, index):\n", " # Rückgabe eines Tupels aus Genome Frequenzen und dem entsprechenden Krebstyp\n", " return self.genome_frequencies[index], self.cancer_types[index]\n", "\n", " def __len__(self):\n", " return len(self.dataframe)\n", "\n", " # Teilen des DataFrame in Trainings- und Validierungsdatensatz\n", " train_df, val_df = train_test_split(self.dataframe, train_size=train_size) #, random_state=random_state)\n", " self.train_df = SplittedDataset(train_df)\n", " self.val_df = SplittedDataset(val_df)\n", "\n", "\n", " def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:\n", " \"\"\"\n", " Gibt ein Tupel aus transformierten Frequenzen und dem entsprechenden Krebstyp für einen gegebenen Index zurück.\n", "\n", " Parameters:\n", " index (int): Der Index des zu abrufenden Datenelements.\n", "\n", " Returns:\n", " Tuple[torch.Tensor, int]: Ein Tupel, bestehend aus einem Tensor der transformierten Frequenzen und dem zugehörigen Krebstyp.\n", " \"\"\"\n", "\n", " print(self.train_df.shape)\n", " print(self.val_df.shape)\n", " \n", " if index < len(self.train_df):\n", " row = self.train_df.iloc[index]\n", " else:\n", " row = self.val_df.iloc[len(self.train_df) - index]\n", "\n", " pca_frequencies_tensor = torch.tensor(row['pca_frequencies'], dtype=torch.float32)\n", " cancer_type = row['encoded_cancer_type']\n", "\n", " return pca_frequencies_tensor, cancer_type\n", "\n", " def __len__(self) -> int:\n", " \"\"\"\n", " Gibt die Gesamtlänge der kombinierten Trainings- und Validierungsdatensätze zurück.\n", "\n", " Returns:\n", " int: Die Länge der kombinierten Datensätze.\n", " \"\"\"\n", " \n", " return len(self.train_df) + len(self.val_df)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Definition des neuronalen Netzes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "import torch.nn.functional as F\n", "\n", "class CancerClassifierNN(nn.Module):\n", " \"\"\"\n", " Eine benutzerdefinierte neuronale Netzwerkklassifikator-Klasse für die Krebsklassifikation.\n", "\n", " Diese Klasse definiert ein mehrschichtiges Perzeptron (MLP), das für die Klassifizierung von Krebsarten\n", " anhand genetischer Frequenzdaten verwendet wird.\n", "\n", " Attributes:\n", " fc1 (nn.Linear): Die erste lineare Schicht des Netzwerks.\n", " fc2 (nn.Linear): Die zweite lineare Schicht des Netzwerks.\n", " fc3 (nn.Linear): Die dritte lineare Schicht des Netzwerks.\n", " fc4 (nn.Linear): Die Ausgabeschicht des Netzwerks.\n", " dropout (nn.Dropout): Ein Dropout-Layer zur Vermeidung von Overfitting.\n", "\n", " Methods:\n", " __init__(self, input_size: int, num_classes: int):\n", " Konstruktor für die CancerClassifierNN Klasse.\n", " forward(self, x: torch.Tensor) -> torch.Tensor:\n", " Definiert den Vorwärtsdurchlauf des Netzwerks.\n", " \"\"\"\n", "\n", " def __init__(self, input_size: int, num_classes: int):\n", " \"\"\"\n", " Konstruktor für die CancerClassifierNN Klasse.\n", "\n", " Parameters:\n", " input_size (int): Die Größe des Input-Features.\n", " num_classes (int): Die Anzahl der Zielklassen.\n", " \"\"\"\n", " super(CancerClassifierNN, self).__init__()\n", " # Definieren der Schichten\n", " self.fc1 = nn.Linear(input_size, input_size) # Eingabeschicht\n", " self.fc2 = nn.Linear(input_size, input_size//2) # Versteckte Schicht\n", " self.fc3 = nn.Linear(input_size//2, input_size//4) # Weitere versteckte Schicht\n", " self.fc4 = nn.Linear(input_size//4, num_classes) # Ausgabeschicht\n", " self.dropout = nn.Dropout(p=0.5) # Dropout\n", "\n", " def forward(self, x: torch.Tensor) -> torch.Tensor:\n", " \"\"\"\n", " Definiert den Vorwärtsdurchlauf des Netzwerks.\n", "\n", " Parameters:\n", " x (torch.Tensor): Der Input-Tensor für das Netzwerk.\n", "\n", " Returns:\n", " torch.Tensor: Der Output-Tensor nach dem Durchlauf durch das Netzwerk.\n", " \"\"\"\n", " x = F.relu(self.fc1(x))\n", " x = self.dropout(x)\n", " x = F.relu(self.fc2(x))\n", " x = self.dropout(x)\n", " x = F.relu(self.fc3(x))\n", " x = self.dropout(x)\n", " x = torch.softmax(self.fc4(x), dim=1) # Oder F.log_softmax(x, dim=1) für Mehrklassenklassifikation\n", " return x" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "import torch.optim as optim\n", "from IPython.display import clear_output\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import os\n", "import pickle\n", "\n", "class ExperimentationalExperiments():\n", "\n", " def __init__(self) -> None:\n", " self.results = None\n", "\n", " def run_single_experiment(self, train_loader: DataLoader, valid_loader: DataLoader, n_pca_components: int, n_epochs: int = 200, learning_rate: int = 0.0005, verbose: bool = True, experiment_num: int = None) -> Tuple:\n", " if not isinstance(n_pca_components, int):\n", " raise TypeError(\"n_pca_components must be an integers!\")\n", "\n", " model = CancerClassifierNN(input_size=n_pca_components, num_classes=3)\n", " model.to(device=device)\n", "\n", " # Verlustfunktion\n", " criterion = nn.CrossEntropyLoss()\n", " # Optimierer\n", " optimizer = optim.Adam(model.parameters(), lr=learning_rate)\n", "\n", " # Listen, um Verluste zu speichern\n", " train_losses = []\n", " valid_losses = []\n", " train_accuracies = []\n", " valid_accuracies = []\n", "\n", " for epoch in range(n_epochs):\n", " model.train()\n", " train_loss = 0.0\n", " correct_predictions = 0\n", " total_predictions = 0\n", "\n", " for i, (inputs, labels) in enumerate(train_loader):\n", " inputs, labels = inputs.to(device), labels.to(device)\n", " optimizer.zero_grad()\n", " outputs = model(inputs)\n", " loss = criterion(outputs, labels)\n", " loss.backward()\n", " optimizer.step()\n", " train_loss += loss.item()\n", "\n", " # Berechnen der Genauigkeit\n", " _, predicted = torch.max(outputs, 1)\n", " correct_predictions += (predicted == labels).sum().item()\n", " total_predictions += labels.size(0)\n", "\n", " # Durchschnittlicher Trainingsverlust und Genauigkeit\n", " train_loss /= len(train_loader)\n", " train_accuracy = correct_predictions / total_predictions\n", " train_losses.append(train_loss)\n", " train_accuracies.append(train_accuracy)\n", "\n", " # Validierungsverlust und Genauigkeit\n", " model.eval()\n", " valid_loss = 0.0\n", " correct_predictions = 0\n", " total_predictions = 0\n", "\n", " with torch.no_grad():\n", " for inputs, labels in valid_loader:\n", " inputs, labels = inputs.to(device), labels.to(device)\n", " outputs = model(inputs)\n", " loss = criterion(outputs, labels)\n", " valid_loss += loss.item()\n", "\n", " # Berechnen der Genauigkeit\n", " _, predicted = torch.max(outputs, 1)\n", " correct_predictions += (predicted == labels).sum().item()\n", " total_predictions += labels.size(0)\n", "\n", " # Durchschnittlicher Validierungsverlust und Genauigkeit\n", " valid_loss /= len(valid_loader)\n", " valid_accuracy = correct_predictions / total_predictions\n", " valid_losses.append(valid_loss)\n", " valid_accuracies.append(valid_accuracy)\n", "\n", " # Aktualisieren des Graphen\n", " clear_output(wait=True)\n", " fig, ax1 = plt.subplots()\n", "\n", " # Zeichnen der Verlustkurven\n", " ax1.plot(train_losses, label='Trainingsverlust', color='r')\n", " ax1.plot(valid_losses, label='Validierungsverlust', color='b')\n", " ax1.set_xlabel('Epochen')\n", " ax1.set_ylabel('Verlust', color='g')\n", " ax1.tick_params(axis='y', labelcolor='g')\n", "\n", " # Zweite y-Achse für die Genauigkeit\n", " ax2 = ax1.twinx()\n", " ax2.plot(train_accuracies, label='Trainingsgenauigkeit', color='r', linestyle='dashed')\n", " ax2.plot(valid_accuracies, label='Validierungsgenauigkeit', color='b', linestyle='dashed')\n", " ax2.set_ylabel('Genauigkeit', color='g')\n", " ax2.tick_params(axis='y', labelcolor='g')\n", "\n", " # Titel und Legende\n", " plt.title(f'Experiment #{experiment_num}: Trainings- und Validierungsverlust und -genauigkeit über die Zeit mit \\n{n_pca_components}-Hauptkomponenten, Lernrate: {learning_rate}')\n", " fig.tight_layout()\n", "\n", " # Legende außerhalb des Graphen\n", " ax1.legend(loc='upper left', bbox_to_anchor=(1.15, 1))\n", " ax2.legend(loc='upper left', bbox_to_anchor=(1.15, 0.85))\n", "\n", " # Fortschritt anzeigen, falls angegeben\n", " if verbose:\n", " print(f'Experiment #{experiment_num} mit {n_pca_components} PCA components: Epoch [{epoch+1}/{n_epochs}], Trainingsverlust: {train_loss:.4f}, Trainingsgenauigkeit: {train_accuracies[-1]:.4f}, Validierungsverlust: {valid_loss:.4f}, Validierungsgenauigkeit: {valid_accuracies[-1]:.4f}')\n", "\n", " # Plot speichern\n", " name = str(experiment_num) + \".png\" if experiment_num is not None else \"single_experiment.png\"\n", " if not os.path.exists(\"Experiments\"):\n", " os.makedirs(\"Experiments\")\n", " if not os.path.exists(f\"Experiments/{str(n_pca_components)}\"):\n", " os.makedirs(f\"Experiments/{str(n_pca_components)}\")\n", " plt.savefig(f\"Experiments/{str(n_pca_components)}/{name}\", bbox_inches='tight')\n", "\n", " return train_losses, valid_losses, train_accuracies, valid_accuracies\n", "\n", " def run_single_pca_experiment(self, train_loader: DataLoader, valid_loader: DataLoader, n_pca_components: int, n_experiments: int, n_epochs: int = 200, learning_rate: int = 0.0005, verbose: bool = True) -> List:\n", " if not isinstance(n_pca_components, int):\n", " raise TypeError(\"n_pca_components must be an integers!\")\n", "\n", " results = []\n", "\n", " for n in range(n_experiments):\n", " res = self.run_single_experiment(train_loader, valid_loader, n_pca_components, n_epochs=n_epochs, learning_rate=learning_rate, verbose=verbose, experiment_num=n+1)\n", " results.append(res)\n", "\n", " return results\n", " \n", "\n", " def run(self, n_pca_components: List[int], n_experiments: int, n_epochs: int = 200, learning_rate: int = 0.0005, batch_size: int = 64, verbose: bool = True) -> Dict:\n", " if not isinstance(n_pca_components, list):\n", " raise TypeError(\"n_pca_components must be a list of integers!\")\n", "\n", " plt.ioff()\n", " self.n_pca_components = n_pca_components\n", "\n", " results = {}\n", "\n", " for n_pca_comps in n_pca_components:\n", " genome_dataset = GenomeDataset(data_frame, n_pca_components=n_pca_comps)\n", " train_dataset = genome_dataset.train_df\n", " valid_dataset = genome_dataset.val_df\n", "\n", " train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", " valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)\n", "\n", " res = self.run_single_pca_experiment(train_loader, valid_loader, n_pca_comps, n_experiments, n_epochs=n_epochs, learning_rate=learning_rate, verbose=verbose)\n", " results[str(n_pca_comps)] = res\n", "\n", " self.plot_and_save_results(res, n_pca_comps)\n", "\n", " self.results = results\n", "\n", " # Speichern der Daten in einer lokalen Datei\n", " with open('Experiments/results.pickle', 'wb') as f:\n", " pickle.dump(self.results, f)\n", "\n", " plt.ion()\n", "\n", " return results\n", "\n", " def plot_and_save_results(self, results: List[Tuple], n_pca_components: int) -> None:\n", " # Mittelwerte und Standardabweichungen berechnen\n", " train_losses, valid_losses, train_accuracies, valid_accuracies = zip(*results)\n", "\n", " train_losses = np.array(train_losses)\n", " valid_losses = np.array(valid_losses)\n", " train_accuracies = np.array(train_accuracies)\n", " valid_accuracies = np.array(valid_accuracies)\n", "\n", " avg_train_losses = np.mean(train_losses, axis=0)\n", " avg_valid_losses = np.mean(valid_losses, axis=0)\n", " avg_train_acc = np.mean(train_accuracies, axis=0)\n", " avg_valid_acc = np.mean(valid_accuracies, axis=0)\n", "\n", " std_train_losses = np.std(train_losses, axis=0)\n", " std_valid_losses = np.std(valid_losses, axis=0)\n", " std_train_acc = np.std(train_accuracies, axis=0)\n", " std_valid_acc = np.std(valid_accuracies, axis=0)\n", "\n", " # Erstellen von Plots\n", " epochs = range(1, len(avg_train_losses) + 1)\n", "\n", " # Plot für Verluste\n", " plt.clf()\n", " plt.plot(epochs, avg_train_losses, label='Mittlerer Trainingsverlust', color='r')\n", " plt.fill_between(epochs, np.subtract(avg_train_losses, std_train_losses), np.add(avg_train_losses, std_train_losses), color='r', alpha=0.2)\n", " plt.plot(epochs, avg_valid_losses, label='Mittlerer Validierungsverlust', color='b')\n", " plt.fill_between(epochs, np.subtract(avg_valid_losses, std_valid_losses), np.add(avg_valid_losses, std_valid_losses), color='b', alpha=0.2)\n", " plt.title(f'Mittelwert und Standardabweichung der Verluste für {n_pca_components} PCA-Komponenten')\n", " plt.xlabel('Experiment Nummer')\n", " plt.ylabel('Verlust')\n", " plt.legend()\n", " plt.savefig(f\"Experiments/{n_pca_components}/average_losses.png\", bbox_inches='tight')\n", " plt.clf()\n", "\n", " # Plot für Genauigkeiten\n", " plt.plot(epochs, avg_train_acc, label='Mittlere Trainingsgenauigkeit', color='r')\n", " plt.fill_between(epochs, np.subtract(avg_train_acc, std_train_acc), np.add(avg_train_acc, std_train_acc), color='r', alpha=0.2)\n", " plt.plot(epochs, avg_valid_acc, label='Mittlere Validierungsgenauigkeit', color='b')\n", " plt.fill_between(epochs, np.subtract(avg_valid_acc, std_valid_acc), np.add(avg_valid_acc, std_valid_acc), color='b', alpha=0.2)\n", " plt.title(f'Mittelwert und Standardabweichung der Genauigkeiten für {n_pca_components} PCA-Komponenten')\n", " plt.xlabel('Experiment Nummer')\n", " plt.ylabel('Genauigkeit')\n", " plt.legend()\n", " plt.savefig(f\"Experiments/{n_pca_components}/average_accuracies.png\", bbox_inches='tight')\n", " plt.clf()\n", "\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "e = ExperimentationalExperiments()\n", "results = e.run([1024, 512, 256, 128, 64, 32, 16], 10, n_epochs=500)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "rl", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.18" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }