Comparative characteristics of Data Mining methods

Characteristic             | k-nearest   | Polynomial     | Decision      | Visualization | Neural    | Classical methods
                           | neighbour   | neural systems | trees         | methods       | networks  | (linear regression)
---------------------------+-------------+----------------+---------------+---------------+-----------+--------------------
Accuracy                   | low         | high           | low           | high          | high      | medium
Scalability                | very low    | medium         | high          | very low      | low       | high
Interpretability           | high/medium | low            | high          | high          | low       | high/medium
Suitability for use        | medium      | high/medium    | high/medium   | high          | low       | high
Labour intensity           | medium/low  | medium/low     | high          | very high     | medium    | medium
Versatility                | low         | medium         | high          | low           | low       | medium
Speed                      | high        | medium/low     | high/medium   | extremely low | very low  | high
Popularity, breadth of use | low         | medium         | high          | high          | low       | low
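For reference, the simplest of the compared algorithms can be written in a few lines of C#. The sketch below is a minimal, illustrative k-nearest-neighbour classifier for two numeric features, using only the standard .NET library; it is a sketch under these assumptions, not part of the software product in Appendix B, and the names KnnSketch and Classify are illustrative.

using System;
using System.Linq;

static class KnnSketch
{
    // Returns the majority label among the k training points that lie closest
    // (by Euclidean distance) to the query point (x1, x2).
    public static int Classify(double[][] trainX, int[] trainY, double x1, double x2, int k)
    {
        return trainX
            .Select((p, i) => new
            {
                Label = trainY[i],
                Dist = Math.Sqrt((p[0] - x1) * (p[0] - x1) + (p[1] - x2) * (p[1] - x2))
            })
            .OrderBy(n => n.Dist)               // sort neighbours by distance
            .Take(k)                            // keep the k nearest
            .GroupBy(n => n.Label)              // vote by label
            .OrderByDescending(g => g.Count())
            .First().Key;
    }

    static void Main()
    {
        // Two tiny training clusters: label 0 near the origin, label 1 near (5, 5).
        double[][] x = { new[] { 0.0, 0.1 }, new[] { 0.2, 0.0 }, new[] { 5.0, 5.1 }, new[] { 4.9, 5.0 } };
        int[] y = { 0, 0, 1, 1 };

        Console.WriteLine(Classify(x, y, 0.1, 0.2, 3));   // prints 0
        Console.WriteLine(Classify(x, y, 4.8, 5.2, 3));   // prints 1
    }
}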

Appendix B

Source code listing of the software product

using System;

using System.Collections.Generic;

using System.ComponentModel;

using System.Data;

using System.Drawing;

using System.Linq;

using System.Text;

using System.Windows.Forms;

using System.IO;

using Extreme.Statistics.Multivariate;

using Extreme.Statistics;

using Extreme.Mathematics;

namespace ClusterRegression

{

public partial class FormMain : Form

{
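// Clustering and regression state shared between the event handlers: cluster
// collections and memberships, the two input variables, the fitted regression
// model, and double-buffered drawing surfaces for the scatter plot.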

KMeansClusterCollection clusters;

CategoricalVariable memberships;

HierarchicalClusterAnalysis hc;

NumericalVariable x1, x2;

NumericalVariable[] variables = null;

VariableCollection collection = null;

Func<Extreme.Mathematics.Vector, Extreme.Mathematics.Vector, double> distanceMeasures;

LinkageMethod linkageMethod;

UnivariateModel model;

int ClustersCount;

string CurrentDirectory;

BufferedGraphicsContext currentContext;

BufferedGraphics GraphicBuffer;

BufferedGraphics DendrogrammaBuffer;

int[] IerarhicalCluster;

int[] KMeansCluster;

public FormMain()

{
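// Default UI state: two clusters, single linkage, Euclidean distance and a
// linear regression model; the bundled data.csv is loaded on start-up.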

InitializeComponent();

CurrentDirectory = Directory.GetCurrentDirectory();

ClustersCountText.Text = "2"; RegressionClusterText.Text = "0";

LinkageMethodList.SelectedIndex = 4;

DistanceList.SelectedIndex = 2;

RegressionMethodList.SelectedIndex = 0;

currentContext = BufferedGraphicsManager.Current;

GraphicBuffer = currentContext.Allocate(GraphicPanel.CreateGraphics(), GraphicPanel.DisplayRectangle);

//DendrogrammaBuffer = currentContext.Allocate(DendrogrammaPanel.CreateGraphics(), DendrogrammaPanel.DisplayRectangle);

OpenFile(CurrentDirectory + "/data.csv");

}

private void ExitButton_Click(object sender, EventArgs e)

{

Close();

}

private void OpenDataButton_Click(object sender, EventArgs e)

{

OpenFileDialog openFileDialog = new OpenFileDialog()

{

InitialDirectory = CurrentDirectory,

Filter = "*.csv|*.csv"

};

if (openFileDialog.ShowDialog() == DialogResult.OK)

{

OpenFile(openFileDialog.FileName);

}

}

private void OpenFile(string FileName)

{
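// The data file stores the two variables as two semicolon-separated lines:
// x1 values on the first line, x2 values on the second.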

using (StreamReader Reader = new StreamReader(FileName))

{

string X1Line = Reader.ReadLine();

string X2Line = Reader.ReadLine();

string[] X1 = X1Line.Split(';');

string[] X2 = X2Line.Split(';');

DataTable.Rows.Clear();

for (int i = 0; i < X1.Length; i++)

{

DataTable.Rows.Add(new string[] { X1[i], X2[i] });

}

}

PerfornAnalysis();

}

private void SaveDataButton_Click(object sender, EventArgs e)

{

if (DataTable.Rows.Count == 1)

{

MessageBox.Show("Данные не введены");

return;

}

SaveFileDialog SaveFile = new SaveFileDialog()

{

InitialDirectory = CurrentDirectory,

Filter = "*.csv|*.csv"

};

if (SaveFile.ShowDialog() == DialogResult.OK)

{

using (StreamWriter Writer = new StreamWriter(SaveFile.FileName))

{

string X1Line = "";

string X2Line = "";

for (int i = 0; i < DataTable.Rows.Count - 1; i++)

{

X1Line += DataTable[0, i].Value.ToString() + ";";

X2Line += DataTable[1, i].Value.ToString() + ";";

}

X1Line = X1Line.Substring(0, X1Line.Length - 1);

X2Line = X2Line.Substring(0, X2Line.Length - 1);

Writer.WriteLine(X1Line);

Writer.WriteLine(X2Line);

}

}

}

private void PerfornAnalysis()

{
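// Re-reads the data grid, runs hierarchical and/or k-means clustering as
// selected, fits the chosen regression model (either to every observation or
// to a single k-means cluster) and redraws the scatter plot.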

try

{

if (DataTable.Rows.Count == 1)

{

// nothing to do

return;

}

double[] X1 = new double[DataTable.Rows.Count - 1];

double[] X2 = new double[DataTable.Rows.Count - 1];

for (int i = 0; i < DataTable.Rows.Count - 1; i++)

{

X1[i] = double.Parse(DataTable[0, i].Value.ToString());

X2[i] = double.Parse(DataTable[1, i].Value.ToString());

}

x1 = new NumericalVariable("x1", X1);

x2 = new NumericalVariable("x2", X2);

variables = new NumericalVariable[] { x1, x2 };

collection = new VariableCollection(variables);

hc = new HierarchicalClusterAnalysis(variables);

hc.Standardize = false;

hc.DistanceMeasure = distanceMeasures;

hc.LinkageMethod = linkageMethod;

KlasterAnalysisText.Text = "";

RegressionTable2.Rows.Clear();

RegressionTable.Rows.Clear();

RegressionAnalysisText.Text = "";

if (IerarhicalMethodCheck.Checked)

{

KlasterAnalysisText.Text = "Hierarchical clustering" + "\r\n";

// Compute the model:

hc.Compute();

// We can partition the cases into clusters:

HierarchicalClusterCollection partition = hc.GetClusterPartition(ClustersCount);

// Individual clusters are accessed through an index, or through enumeration.

foreach (HierarchicalCluster cluster in partition)

KlasterAnalysisText.Text += "Cluster " + cluster.Index + " has " + cluster.Size + " members." + "\r\n";

// And get a filter for the observations in a single cluster:

collection.Filter = partition[0].MemberFilter;

KlasterAnalysisText.Text += "Number of items in filtered collection: " + collection.Observations.Count + "\r\n";

collection.Filter = null;

// Get a variable that shows memberships:

memberships = partition.GetMemberships();

IerarhicalCluster = new int[memberships.Length];

for (int i = 0; i < memberships.Length; i++)

{

KlasterAnalysisText.Text += "Observation " + i + " belongs to cluster " + memberships.GetLevelIndex(i) + "\r\n";

IerarhicalCluster[i] = memberships.GetLevelIndex(i);

}

// A dendrogram is a graphical representation of the clustering in the form of a tree.

// You can get all the information you need to draw a dendrogram starting from

// the root node of the dendrogram:

DendrogramNode root = hc.DendrogramRoot;

// Position and DistanceMeasure give the x and y coordinates:

KlasterAnalysisText.Text += "Root position: " + root.Position + ", " + root.DistanceMeasure + "\r\n";

// The left and right children:

KlasterAnalysisText.Text += "Position of left child: " + root.LeftChild.Position + "\r\n";

KlasterAnalysisText.Text += "Position of right child: " + root.RightChild.Position + "\r\n" + "\r\n";

// You can also get a filter that defines a sort order suitable for

// drawing the dendrogram:

Filter sortOrder = hc.GetDendrogramOrder();

}

if (KMethodCheck.Checked)

{

if (ClustersCount > DataTable.Rows.Count - 1 || ClustersCount <= 1)

{

// bad 'ClustersCount'

return;

}

//

// K-Means Clustering

//

KlasterAnalysisText.Text += "K-means clustering" + "\r\n";

// Create the model:

KMeansClusterAnalysis kmc = new KMeansClusterAnalysis(variables, ClustersCount);

// Rescale the variables to their Z-scores before doing the analysis:

kmc.Standardize = false;

// Compute the model:

kmc.Compute();

// We can partition the cases into clusters:

clusters = kmc.GetClusters();

// Individual clusters are accessed through an index, or through enumeration.

foreach (KMeansCluster cluster in clusters)

{

KlasterAnalysisText.Text += "Cluster " + cluster.Index + " has " + cluster.Size + " members. Sum of squares: " +

cluster.SumOfSquares + "\r\n";

KlasterAnalysisText.Text += "Center: " + cluster.Center + "\r\n";

}

// The distances between clusters are also available:

KlasterAnalysisText.Text += kmc.GetClusterDistances().ToString("F4") + "\r\n";

// You can get a filter for the observations in a single cluster:

collection.Filter = clusters[1].MemberFilter;

KlasterAnalysisText.Text += "Number of items in filtered collection: " + collection.Observations.Count + "\r\n";

// Get a variable that shows memberships:

memberships = clusters.GetMemberships();

KMeansCluster = new int[memberships.Length];

// And one that shows the distances to the centers:

NumericalVariable distances = clusters.GetDistancesToCenters();

for (int i = 0; i < memberships.Length; i++)

{

KlasterAnalysisText.Text += "Observation " + i + " belongs to cluster " + memberships.GetLevelIndex(i) + ", distance: " +

distances[i] + "\r\n";

KMeansCluster[i] = memberships.GetLevelIndex(i);

}

}

//

// Regression Analysis

//

// Multiple linear regression can be performed using

// the LinearRegressionModel class.

//

// This QuickStart sample uses old economic data about 50 countries

// from Belsley, Kuh and Welsch. The fields are as follows:

// DispInc: Per capita disposable income.

// Growth: Percent rate of change of DispInc.

// Pop15: Percentage of population under 15.

// Pop75: Percentage of population over 75.

// Savings: Aggregate savings divided by disposable income.

//

// We want to investigate the effect of the first four variables on the savings ratio.

//DataTable dataTable = new DataTable("savings");

//object[] rowData = new object[data.Columns.Count];

//data.Rows.Add(rowData);

//DataTable dataTable;

//dataTable = ReadData();

// Next, create a VariableCollection from the data table:

//VariableCollection data = new VariableCollection(dataTable);

if (KMethodCheck.Checked || RegressionAllElementsCheck.Checked)

{

X1 = new double[DataTable.Rows.Count - 1];

X2 = new double[DataTable.Rows.Count - 1];

if (!RegressionAllElementsCheck.Checked)

{

List<double> X1L = new List<double>();

List<double> X2L = new List<double>();

for (int i = 0; i < DataTable.Rows.Count - 1; i++)

{

if (KMeansCluster[i] == Int32.Parse(RegressionClusterText.Text))

{

X1L.Add(double.Parse(DataTable[0, i].Value.ToString()));

X2L.Add(double.Parse(DataTable[1, i].Value.ToString()));

}

}

X1 = X1L.ToArray();

X2 = X2L.ToArray();

}

else

{

for (int i = 0; i < DataTable.Rows.Count - 1; i++)

{

X1[i] = double.Parse(DataTable[0, i].Value.ToString());

X2[i] = double.Parse(DataTable[1, i].Value.ToString());

}

}

if (X1.Length < 2)

{

MessageBox.Show("Кластер " + Int32.Parse(RegressionClusterText.Text) + " содержит " + X1.Length + " элемент(ов), необходимо как минимум 2");

}

x1 = new NumericalVariable("x1", X1);

x2 = new NumericalVariable("x2", X2);

variables = new NumericalVariable[] { x1, x2 };

collection = new VariableCollection(variables);

// Now create the regression model. Parameters are the name

// of the dependent variable, a string array containing

// the names of the independent variables, and the VariableCollection

// containing all variables.

// LinearRegressionModel model = new LinearRegressionModel(data,

// "Savings", new string[] { "Pop15", "Pop75", "DispInc", "Growth" });

switch (RegressionMethodList.SelectedIndex)

{
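// 0 = linear regression, 1 = logistic regression, 2 = polynomial (degree 2);
// the commented-out case 3 was a nonlinear (Gaussian curve) model.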

case 0:

{

model = new LinearRegressionModel(collection, "x1", new string[] { "x2" });

(model as LinearRegressionModel).NoIntercept = false;

break;

}

case 1:

{

model = new LogisticRegressionModel(x1, new NumericalVariable[] { x2 });

break;

}

case 2:

{

model = new PolynomialRegressionModel(x1, x2, 2);

(model as PolynomialRegressionModel).NoIntercept = false;

break;

}

//case 3:

//{

// model = new NonlinearRegressionModel(collection, "x1","x2" );

// (model as NonlinearRegressionModel).Curve = new Extreme.Mathematics.Curves.Nonlinear.GaussianCurve();

// break;

//}

}

// We can set model options now, such as whether to include a constant:

// The Compute method performs the actual regression analysis.

model.Compute();

// The Parameters collection contains information about the regression

// parameters.

//RegressionAnalysisText.Text += "Variable Value Std.Error t-stat p-Value";

foreach (Parameter parameter in model.Parameters)

{

string[] Params = new string[5];

Params[0] = parameter.Name;

Params[1] = parameter.Value.ToString();

Params[2] = parameter.StandardError.ToString();

Params[3] = parameter.Statistic.ToString();

Params[4] = parameter.PValue.ToString();

RegressionTable.Rows.Add(Params);

// Parameter objects have the following properties:

// Name, usually the name of the variable:

// Estimated value of the parameter:

// Standard error:

// The value of the t statistic for the hypothesis that the parameter is zero.

// Probability corresponding to the t statistic.

//RegressionAnalysisText.Text += parameter.Name + parameter.Value + parameter.StandardError + parameter.Statistic + parameter.PValue + "\r\n";

}

// In addition to these properties, Parameter objects have a GetConfidenceInterval

// method that returns a confidence interval at a specified confidence level.

// Notice that individual parameters can be accessed using their numeric index.

// Parameter 0 is the intercept, if it was included.

Interval confidenceInterval = model.Parameters[0].GetConfidenceInterval(0.95);

RegressionAnalysisText.Text += "95% confidence interval for Growth: " + confidenceInterval.LowerBound + " - " + confidenceInterval.UpperBound + "\r\n";

// Parameters can also be accessed by name:

//confidenceInterval = model.Parameters["DispInc"].GetConfidenceInterval(0.95);

confidenceInterval = model.Parameters["x2"].GetConfidenceInterval(0.95);

RegressionAnalysisText.Text += "95% confidence interval for Growth: " + confidenceInterval.LowerBound + " - " + confidenceInterval.UpperBound + "\r\n";

// There is also a wealth of information about the analysis available

// through various properties of the LinearRegressionModel object:

RegressionAnalysisText.Text += "Residual standard error: " + model.StandardError + "\r\n";

RegressionAnalysisText.Text += "R-Squared: " + model.RSquared + "\r\n";

RegressionAnalysisText.Text += "Adjusted R-Squared: " + model.AdjustedRSquared + "\r\n";

RegressionAnalysisText.Text += "F-statistic: " + model.FStatistic + "\r\n";

RegressionAnalysisText.Text += "Corresponding p-value: " + model.PValue + "\r\n";

// Much of this data can be summarized in the form of an ANOVA table:

foreach (AnovaRow row in model.AnovaTable.Rows)

{

string[] Params = new string[5];

Params[0] = row.SumOfSquares.ToString();

Params[1] = row.DegreesOfFreedom.ToString();

Params[2] = row.MeanSquare.ToString();

if (row is AnovaModelRow)

{

Params[3] = (row as AnovaModelRow).FStatistic.ToString();

Params[4] = (row as AnovaModelRow).PValue.ToString();

}

RegressionTable2.Rows.Add(Params);

}

//RegressionAnalysisText.Text += model.AnovaTable.ToString() + "\r\n";

}

RenderGraphic();

}

// Errors from incomplete or invalid input (e.g. a half-edited cell) are
// swallowed; the previously computed output stays on screen.

catch (Exception)

{ }

}

private void LinkageMethodList_SelectedIndexChanged(object sender, EventArgs e)

{

if (LinkageMethodList.SelectedIndex == 0)

{

linkageMethod = LinkageMethod.Average;

}

else if (LinkageMethodList.SelectedIndex == 1)

{

linkageMethod = LinkageMethod.Centroid;

}

else if (LinkageMethodList.SelectedIndex == 2)

{

linkageMethod = LinkageMethod.Complete;

}

else if (LinkageMethodList.SelectedIndex == 3)

{

linkageMethod = LinkageMethod.Median;

}

else if (LinkageMethodList.SelectedIndex == 4)

{

linkageMethod = LinkageMethod.Single;

}

else if (LinkageMethodList.SelectedIndex == 5)

{

linkageMethod = LinkageMethod.Ward;

}

PerfornAnalysis();

}

private void DistanceList_SelectedIndexChanged(object sender, EventArgs e)

{

if (DistanceList.SelectedIndex == 0)

{

distanceMeasures = DistanceMeasures.CanberraDistance;

}

else if (DistanceList.SelectedIndex == 1)

{

distanceMeasures = DistanceMeasures.CorrelationDistance;

}

else if (DistanceList.SelectedIndex == 2)

{

distanceMeasures = DistanceMeasures.EuclidianDistance;

}

else if (DistanceList.SelectedIndex == 3)

{

distanceMeasures = DistanceMeasures.ManhattanDistance;

}

else if (DistanceList.SelectedIndex == 4)

{

distanceMeasures = DistanceMeasures.MaximumDistance;

}

else if (DistanceList.SelectedIndex == 5)

{

distanceMeasures = DistanceMeasures.SquaredEuclidianDistance;

}

PerfornAnalysis();

}

private void IerarhicalMethodCheck_CheckedChanged(object sender, EventArgs e)

{

PerfornAnalysis();

}

private void KMethodCheck_CheckedChanged(object sender, EventArgs e)

{

PerfornAnalysis();

}

private void ClustersCountText_TextChanged(object sender, EventArgs e)

{

// TryParse guards against an empty or non-numeric value while the user is typing.

if (Int32.TryParse(ClustersCountText.Text, out ClustersCount))

{

PerfornAnalysis();

}

}

private void RenderGraphic()

{
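// Draws the (x1, x2) scatter plot into the off-screen buffer: points in the
// same cluster are connected by lines, black for the hierarchical result and
// blue for k-means, and each point is labelled "index/cluster".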

Graphics g = GraphicBuffer.Graphics;

g.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.AntiAlias;

g.FillRectangle(new SolidBrush(Color.White), GraphicPanel.DisplayRectangle);

if (DataTable.Rows.Count > 1)

{

double[] X1 = new double[DataTable.Rows.Count - 1];

double[] X2 = new double[DataTable.Rows.Count - 1];

for (int i = 0; i < DataTable.Rows.Count - 1; i++)

{

X1[i] = double.Parse(DataTable[0, i].Value.ToString());

X2[i] = double.Parse(DataTable[1, i].Value.ToString());

}

float Radius = 10.0f;

double Border = 40;

double GraphicSize = Math.Min(GraphicPanel.Size.Height, GraphicPanel.Size.Width) - Border * 2;

double MinX1 = X1.Min();

double MaxX1 = X1.Max();

double dX1 = MaxX1 - MinX1;

double MinX2 = X2.Min();

double MaxX2 = X2.Max();

double dX2 = MaxX2 - MinX2;

double dX = Math.Max(dX1, dX2);

for (int i = 0; i < X1.Length; i++)

{

X1[i] = (MinX1 + Border + (X1[i] - MinX1) / dX * GraphicSize);

X2[i] = (GraphicPanel.Size.Height - (MinX2 + Border + (X2[i] - MinX2) / dX * GraphicSize));

g.FillEllipse(new SolidBrush(Color.FromArgb(0, 0, 0)), (float)X1[i], (float)X2[i], Radius, Radius);

}

g.DrawString(MinX1.ToString(), new Font("Segoe WP Semilight", 10.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 0)), new PointF((float)(Border - 5), (float)(GraphicPanel.Size.Height - Border + 10)));

g.DrawString(MinX2.ToString(), new Font("Segoe WP Semilight", 10.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 0)), new PointF((float)(Border - 25), (float)(GraphicPanel.Size.Height - Border - 10)));

g.DrawString((MinX1 + dX).ToString(), new Font("Segoe WP Semilight", 10.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 0)), new PointF((float)(GraphicPanel.Size.Width - Border), (float)(GraphicPanel.Size.Height - Border + 10)));

g.DrawString((MinX2 + dX).ToString(), new Font("Segoe WP Semilight", 10.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 0)), new PointF((float)(Border - 25), (float)(Border - 10)));

if (IerarhicalCluster != null && IerarhicalMethodCheck.Checked)

{

for (int i = 0; i < X1.Length - 1; i++)

{

for (int j = i + 1; j < X1.Length; j++)

{

if (IerarhicalCluster[i] == IerarhicalCluster[j])

g.DrawLine(new Pen(Color.FromArgb(0, 0, 0), 1.0f), (float)X1[i] + Radius / 2.0f, (float)X2[i] + Radius / 2.0f, (float)X1[j] + Radius / 2.0f, (float)X2[j] + Radius / 2.0f);

}

g.DrawString(i + "/" + IerarhicalCluster[i].ToString(), new Font("Segoe WP Semilight", 12.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 0)), new PointF((float)X1[i] + 2 * Radius, (float)X2[i] - 2 * Radius));

}

g.DrawString(X1.Length - 1 + "/" + IerarhicalCluster[X1.Length - 1].ToString(), new Font("Segoe WP Semilight", 12.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 0)), new PointF((float)X1[X1.Length - 1] + 2 * Radius, (float)X2[X1.Length - 1] - 2 * Radius));

g.DrawString("Иерархичекий", new Font("Segoe WP Semilight", 12.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 0)), new PointF(200.0f, 10.0f));

}

if (KMeansCluster != null && KMethodCheck.Checked)

{

for (int i = 0; i < X1.Length - 1; i++)

{

for (int j = i + 1; j < X1.Length; j++)

{

if (KMeansCluster[i] == KMeansCluster[j])

g.DrawLine(new Pen(Color.FromArgb(0, 0, 255), 1.0f), (float)X1[i] + Radius / 2.0f, (float)X2[i] + Radius / 2.0f, (float)X1[j] + Radius / 2.0f, (float)X2[j] + Radius / 2.0f);

}

g.DrawString(i + "/" + KMeansCluster[i].ToString(), new Font("Segoe WP Semilight", 12.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 255)), new PointF((float)X1[i] - 2 * Radius, (float)X2[i] - 2 * Radius));

}

g.DrawString(X1.Length - 1 + "/" + KMeansCluster[X1.Length - 1].ToString(), new Font("Segoe WP Semilight", 12.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 255)), new PointF((float)X1[X1.Length - 1] - 2 * Radius, (float)X2[X1.Length - 1] - 2 * Radius));

g.DrawString("K-средних", new Font("Segoe WP Semilight", 12.0f, FontStyle.Regular), new SolidBrush(Color.FromArgb(0, 0, 255)), new PointF(110.0f, 10.0f));

}

}

GraphicBuffer.Render();

GraphicBuffer.Render(GraphicPanel.CreateGraphics());

}

private void GraphicPanel_Resize(object sender, EventArgs e)

{

GraphicBuffer = currentContext.Allocate(GraphicPanel.CreateGraphics(), GraphicPanel.DisplayRectangle);

}

//DendrogrammaBuffer = currentContext.Allocate(DendrogrammaPanel.CreateGraphics(), DendrogrammaPanel.DisplayRectangle);

private void GraphicPanel_Paint(object sender, PaintEventArgs e)

{

RenderGraphic();

}

private void DataTable_CellValueChanged(object sender, DataGridViewCellEventArgs e)

{

PerfornAnalysis();

}

private void RegressionClusterText_TextChanged(object sender, EventArgs e)

{

PerfornAnalysis();

}

private void DataTable_RowsRemoved(object sender, DataGridViewRowsRemovedEventArgs e)

{

PerfornAnalysis();

}

private void RegressionMethodList_SelectedIndexChanged(object sender, EventArgs e)

{

PerfornAnalysis();

}

private void RegressionAllElementsCheck_CheckedChanged(object sender, EventArgs e)

{

PerfornAnalysis();

}

}

}

Appendix C