<?php
namespace ClusteringKMeans;

/**
 * Class KMeans
 * @package ClusteringKMeans
 */

class KMeans
{
    /**
     * Partition the rows of $rawData into $numClusters groups with k-means.
     *
     * The returned array is indexed by tuple position; each cell holds the
     * cluster ID of that tuple. Example: [2, 1, 0, 0, 2, 2] means tuple 0 is in
     * cluster 2, tuple 1 is in cluster 1, tuples 2 and 3 are in cluster 0, etc.
     *
     * The input is not modified: clustering runs on a min-max normalized copy
     * so that columns with large magnitudes do not dominate the distance.
     * The initial assignment is seeded with a fixed seed (0), and the loop
     * stops when no assignment changes, when an update would leave a cluster
     * empty, or after 10 * (number of tuples) iterations.
     *
     * @param float[][] $rawData     zero-indexed list of rows; every row is a
     *                               zero-indexed list with the same number of columns
     * @param int       $numClusters number of clusters, 1..count($rawData)
     * @return int[] cluster ID per tuple; empty array when $rawData is empty
     * @throws \InvalidArgumentException when $numClusters is below 1 or exceeds
     *                                   the number of tuples
     */
    public static function Cluster($rawData, $numClusters)
    {
        $numTuples = count($rawData);
        if ($numTuples === 0) {
            return array(); // nothing to cluster
        }

        $numClusters = (int) $numClusters;
        if ($numClusters < 1 || $numClusters > $numTuples) {
            // InitClustering puts one tuple in each cluster, which is
            // impossible outside this range.
            throw new \InvalidArgumentException(
                'numClusters must be between 1 and the number of tuples (' . $numTuples . '), got ' . $numClusters
            );
        }

        $data = self::Normalized($rawData); // so large values don't dominate

        // Start from a semi-random assignment; an alternative is to pick
        // random tuples as the initial means and assign from those.
        $clustering = self::InitClustering($numTuples, $numClusters, 0);
        $means = self::Allocate($numClusters, count($data[0]));

        $maxCount = $numTuples * 10; // sanity cap; k-means typically converges very quickly
        $ct = 0;
        $changed = true; // was there a change in at least one cluster assignment?
        $success = true; // were all means able to be computed? (no zero-count clusters)

        while ($changed && $success && $ct < $maxCount) {
            ++$ct;
            // Compute new cluster means if possible; $means is untouched on failure.
            $success = self::UpdateMeans($data, $clustering, $means);
            // (Re)assign tuples to the closest mean; $clustering is untouched on failure.
            $changed = self::UpdateClustering($data, $clustering, $means);
        }

        // The final means (useful for discretization or RBF centroids) and a
        // cluster-quality measure are not returned; both can be derived from
        // the returned clustering if needed.
        return $clustering;
    }

    /**
     * Return a min-max normalized copy of the data: v' = (v - min) / (max - min),
     * computed independently for each column.
     *
     * A column whose values are all equal carries no information for the
     * distance computation, so it is mapped to 0.0 (dividing by a zero range
     * is avoided). The main alternative scheme is z-score normalization,
     * (x - mean) / stddev.
     *
     * @param float[][] $rawData
     * @return float[][] normalized copy; $rawData itself is left unchanged
     */
    private static function Normalized($rawData)
    {
        $result = $rawData; // PHP arrays have value semantics, so this is a copy
        $numRows = count($result);
        if ($numRows === 0) {
            return $result;
        }

        $numCols = count($result[0]);
        for ($j = 0; $j < $numCols; ++$j) { // each column
            $min = INF;
            $max = -INF;
            for ($i = 0; $i < $numRows; ++$i) {
                $value = $result[$i][$j];
                if ($value < $min) {
                    $min = $value;
                }
                if ($value > $max) {
                    $max = $value;
                }
            }

            $range = $max - $min;
            for ($i = 0; $i < $numRows; ++$i) {
                $result[$i][$j] = ($range == 0)
                    ? 0.0 // constant column: must not contribute INF/NAN to distances
                    : ($result[$i][$j] - $min) / $range;
            }
        }

        return $result;
    }

    /**
     * Build the initial clustering semi-randomly: the first $numClusters tuples
     * are assigned to clusters 0..$numClusters-1 so every cluster starts with at
     * least one tuple, and the remaining tuples get a random cluster.
     *
     * Note: this seeds PHP's global generator via srand(), which also affects
     * rand() calls made by the caller afterwards.
     *
     * Alternatives worth considering: k-means++ initialization, or picking
     * $numClusters tuples as initial means and assigning from those.
     *
     * @param int $numTuples
     * @param int $numClusters expected to be <= $numTuples
     * @param int $randomSeed
     * @return int[]
     */
    private static function InitClustering($numTuples, $numClusters, $randomSeed)
    {
        srand($randomSeed);
        $clustering = array_fill(0, $numTuples, 0);
        for ($i = 0; $i < $numClusters; ++$i) { // make sure each cluster has at least one tuple
            $clustering[$i] = $i;
        }
        for ($i = $numClusters; $i < $numTuples; ++$i) {
            $clustering[$i] = rand(0, $numClusters - 1); // other assignments random
        }
        return $clustering;
    }

    /**
     * Convenience matrix allocator for Cluster(): $numClusters rows of
     * $numColumns cells each, all initialized to 0.0.
     *
     * @param int $numClusters
     * @param int $numColumns
     * @return float[][]
     */
    private static function Allocate($numClusters, $numColumns)
    {
        $result = array();
        for ($k = 0; $k < $numClusters; ++$k) {
            $result[$k] = array_fill(0, $numColumns, 0.0);
        }
        return $result;
    }

    /**
     * Recompute every cluster mean from the current assignment.
     *
     * $means is passed by reference and overwritten in place. If any cluster
     * has no tuples assigned, nothing is written and false is returned.
     *
     * @param float[][] $data
     * @param int[]     $clustering
     * @param float[][] $means      in/out: one row per cluster
     * @return bool false if some cluster is empty, true otherwise
     */
    private static function UpdateMeans($data, $clustering, &$means)
    {
        $numClusters = count($means);
        $numTuples = count($data);

        // Count members per cluster, starting from explicit zeros so empty
        // clusters are visible as 0 rather than as missing keys.
        $clusterCounts = array_fill(0, $numClusters, 0);
        for ($i = 0; $i < $numTuples; ++$i) {
            ++$clusterCounts[$clustering[$i]];
        }

        if (in_array(0, $clusterCounts, true)) {
            return false; // bad clustering. no change to means[][]
        }

        // Zero-out means so it can be used as a scratch matrix for the sums.
        for ($k = 0; $k < $numClusters; ++$k) {
            $numCols = count($means[$k]);
            for ($j = 0; $j < $numCols; ++$j) {
                $means[$k][$j] = 0.0;
            }
        }

        for ($i = 0; $i < $numTuples; ++$i) {
            $cluster = $clustering[$i];
            $numCols = count($data[$i]);
            for ($j = 0; $j < $numCols; ++$j) {
                $means[$cluster][$j] += $data[$i][$j]; // accumulate sum
            }
        }

        for ($k = 0; $k < $numClusters; ++$k) {
            $numCols = count($means[$k]);
            for ($j = 0; $j < $numCols; ++$j) {
                $means[$k][$j] /= $clusterCounts[$k]; // counts are > 0 (checked above)
            }
        }
        return true;
    }

    /**
     * (Re)assign each tuple to the cluster whose mean is closest.
     *
     * $clustering is passed by reference and replaced only when the proposed
     * assignment differs from the current one AND leaves no cluster empty.
     *
     * @param float[][] $data
     * @param int[]     $clustering in/out
     * @param float[][] $means
     * @return bool true if the clustering was updated; false if nothing
     *              changed or the proposal would empty a cluster
     */
    private static function UpdateClustering($data, &$clustering, $means)
    {
        $numClusters = count($means);
        $numTuples = count($data);
        $changed = false;

        $newClustering = $clustering; // proposed result
        $distances = array(); // distances from current tuple to each mean

        for ($i = 0; $i < $numTuples; ++$i) { // walk through each tuple
            for ($k = 0; $k < $numClusters; ++$k) {
                $distances[$k] = self::Distance($data[$i], $means[$k]);
            }

            $newClusterID = self::MinIndex($distances); // find closest mean ID
            if ($newClusterID !== $newClustering[$i]) {
                $changed = true;
                $newClustering[$i] = $newClusterID;
            }
        }

        if (!$changed) {
            return false; // no change so bail and don't update clustering[]
        }

        // Check the proposed clustering's cluster counts.
        $clusterCounts = array_fill(0, $numClusters, 0);
        for ($i = 0; $i < $numTuples; ++$i) {
            ++$clusterCounts[$newClustering[$i]];
        }

        if (in_array(0, $clusterCounts, true)) {
            return false; // bad clustering. no change to clustering[]
        }

        $clustering = $newClustering;
        return true;
    }

    /**
     * Euclidean distance between a tuple and a cluster mean, used by
     * UpdateClustering(). Manhattan distance would be a possible alternative.
     *
     * @param float[] $tuple
     * @param float[] $mean
     * @return float Euclidean distance
     */
    private static function Distance($tuple, $mean)
    {
        $sumSquaredDiffs = 0.0;
        $numCols = count($tuple);
        for ($j = 0; $j < $numCols; ++$j) {
            $diff = $tuple[$j] - $mean[$j];
            $sumSquaredDiffs += $diff * $diff;
        }
        return sqrt($sumSquaredDiffs);
    }

    /**
     * Index of the smallest value in an array; helper for UpdateClustering().
     * On ties the lowest index wins.
     *
     * @param float[] $distances distances from one tuple to each cluster mean
     * @return int index of the minimum distance
     */
    private static function MinIndex($distances)
    {
        $indexOfMin = 0;
        $smallDist = $distances[0];
        $numDistances = count($distances);
        for ($k = 1; $k < $numDistances; ++$k) {
            if ($distances[$k] < $smallDist) {
                $smallDist = $distances[$k];
                $indexOfMin = $k;
            }
        }
        return $indexOfMin;
    }
}
?>
