class Point:
    """A point (equivalently, a 2-D vector) in the plane.

    Supports Euclidean distance (``distanceTo``), scaling by a real number
    (``p * k`` and ``k * p``) and component-wise addition (``p + q``).
    All arithmetic returns a new ``Point``; instances are never mutated.
    """

    # Class-level defaults, kept so that ``Point.x`` / ``Point.y`` still resolve.
    x = 0
    y = 0

    def __init__(self, x = 0, y = 0):
        self.x = x
        self.y = y

    def __repr__(self):
        return "Point({!r}, {!r})".format(self.x, self.y)

    def distanceTo(self, point):
        """Return the Euclidean distance between this point and ``point``."""
        return np.sqrt((self.x - point.x)**2 + (self.y - point.y)**2)

    def __mul__(self, other):
        """Scale both coordinates by the real number ``other``.

        Accepts Python and NumPy ints/floats. For any other operand type
        ``NotImplemented`` is returned so that Python raises ``TypeError``
        (the check is not an ``assert``, so it also holds under ``python -O``).
        """
        if not isinstance(other, (int, float, np.integer, np.floating)):
            return NotImplemented
        return Point(self.x * other, self.y * other)

    # Scalar multiplication is commutative, so ``k * p`` behaves like ``p * k``.
    __rmul__ = __mul__

    def __add__(self, other):
        """Return the component-wise sum of two points."""
        if not isinstance(other, Point):
            return NotImplemented
        return Point(self.x + other.x, self.y + other.y)


class Cluster:
    """A cluster summarised by its running centroid and its point count.

    Attributes:
        centroid: mean of all points added so far, or ``None`` while empty.
        num: number of points that contributed to ``centroid``.
        cid: cluster ID.
    """

    centroid = None   # Centroid point of the cluster (None while empty)
    num = 0           # Number of points in the cluster
    cid = 0           # Cluster ID

    def __init__(self, centroid: "Point | None" = None, cid = 0):
        """Create a cluster, optionally seeded with its first point.

        Args:
            centroid: the first point of the cluster. When omitted the cluster
                starts empty; a shared default ``Point()`` is deliberately NOT
                used, because it would count as a phantom point at the origin.
            cid: cluster ID.
        """
        self.centroid = centroid
        self.cid = cid
        # A seeded cluster already contains one point; an empty one contains none.
        self.num = 0 if centroid is None else 1

    def addPoint(self, point):
        """Add ``point`` to the cluster and update the centroid incrementally."""
        # If there is no centroid yet, use (a copy of) the point as centroid.
        # The test must be ``is None``: Point defines no truth value, so
        # ``not self.centroid`` would never be true.
        if self.centroid is None:
            self.num = 1
            self.centroid = Point(point.x, point.y)
        # If there is a centroid, update it as a running mean:
        #   new = old * (n - 1) / n + point / n
        else:
            self.num += 1
            self.centroid = self.centroid * ((self.num - 1) / float(self.num)) + point * (1 / float(self.num))


import random
colormap = []

def get_random_hex_color():
    """Generates a random hex color code of the form '#RRGGBB'."""
    def r():
        return random.randint(0, 255)
    return '#%02X%02X%02X' % (r(),r(),r())

def initplot():
    """Set the figure title and reset ``colormap`` to the initial colors."""
    global colormap
    plt.suptitle('SinglePass')
    # Initial colors, more will be generated if needed
    colormap = ['black','red','blue','green','orange','purple','yellow','pink']

def plot(point,c):
    """Scatter-plot ``point`` in the color associated with cluster index ``c``.

    ``colormap`` is extended with random colors until index ``c`` exists.
    """
    global colormap
    # Generate random colors if needed
    while c > len(colormap) - 1:
        colormap.append(get_random_hex_color())
    # Plot the data point in the color of the cluster
    plt.scatter(point.x, point.y, c=colormap[c], s=40)
clusters = []   # All clusters created so far (shared, module-level state)
cid = 1         # ID that the next newly created cluster will receive

def SinglePass(point, threshold):
    """Assign one streamed data point to a cluster (Single Pass clustering).

    The point joins the cluster whose centroid is nearest, provided that
    distance is strictly below ``threshold``; otherwise it seeds a new
    cluster. Only the chosen cluster is updated: adding the point to every
    cluster within range would move several centroids and count the point
    more than once.

    Args:
        point: object exposing ``distanceTo(centroid)``.
        threshold: maximum (exclusive) distance to an existing centroid.

    Returns:
        Tuple ``(point, cluster_id)`` with the ID of the assigned cluster.

    Side effects:
        Updates the module-level ``clusters`` list and ``cid`` counter.
    """
    global clusters
    global cid

    # Find the nearest existing cluster first, without modifying anything.
    # Strict '<' keeps the earliest cluster when distances tie.
    nearest = None
    nearest_dist = float('inf')
    for cluster in clusters:
        dist = point.distanceTo(cluster.centroid)
        if dist < nearest_dist:
            nearest = cluster
            nearest_dist = dist

    # Close enough: add the point to that single cluster (which also moves
    # its centroid).
    if nearest is not None and nearest_dist < threshold:
        nearest.addPoint(point)
        return (point, nearest.cid)

    # Otherwise the point starts a new cluster of its own.
    point_cid = cid
    clusters.append(Cluster(point, point_cid))
    cid += 1
    return (point, point_cid)
filepath = "./clustering-datasets/"
filenames = ["Aggregation","Compound","D31","flame","jain","pathbased","R15","spiral"]
fileextension = ".txt"

# Distance threshold used by SinglePass for each dataset.
thresholds = {"Aggregation": 10,
              "Compound": 13,
              "D31": 4,
              "flame": 8.1,
              "jain": 13,
              "pathbased": 10,
              "R15": 6,
              "spiral": 7}


def main(filename):
    """Cluster and plot one dataset with Single Pass clustering.

    Reads ``filepath + filename + fileextension`` line by line, takes the
    first two whitespace-separated fields of each line as x and y, feeds the
    point to ``SinglePass`` and plots it in the color of its cluster.

    Args:
        filename: dataset name; must be a key of ``thresholds``.

    Raises:
        KeyError: if ``filename`` has no entry in ``thresholds``.
        OSError: if the dataset file cannot be opened.
        ValueError: if a coordinate field is not a valid float.
    """
    global clusters, cid

    threshold = thresholds[filename]

    # Start from a clean state so a previous run (e.g. on another dataset)
    # cannot leak its clusters into this one.
    clusters = []
    cid = 1

    initplot()
    f = filepath+filename+fileextension
    # 'with' closes the file even if parsing or plotting raises.
    with open(f,"r") as file:
        for line in file:
            # split() already discards the trailing newline; slicing off the
            # last character would truncate a final line that lacks one.
            a = line.split()
            if len(a) < 2:
                continue  # skip blank or incomplete lines
            x = float(a[0])
            y = float(a[1])
            nPoint, nCid = SinglePass(Point(x,y),threshold)
            plot(nPoint, nCid)
    print("Finished clustering dataset: " + filename)
# Run Single Pass clustering on the second dataset listed in `filenames`.
main(filename=filenames[1])