iris_deal.ipynb 284 KB

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns                #绘制线箱图

# Load the iris dataset bundled with scikit-learn and keep the two raw pieces
# that the later analysis steps reuse: the feature matrix and the class codes.
iris_database = load_iris()
data = iris_database["data"]
target = iris_database["target"]

print("target=", target)
print("\n")

# Tabular view of the four measurements with readable column names.
# (The third column used to be misspelled as 'Patal_Length'.)
iris_df = pd.DataFrame(
    iris_database.data,
    columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
)
print(iris_df)

# Draw a kernel density estimate (KDE) of sepal length for each species.

# (class code, species name, curve colour). Each species gets its own colour;
# setosa and virginica were previously both drawn in red and could not be
# told apart.
species_styles = [
    (0, 'Iris_Setosa', 'r'),
    (1, 'Iris_Versicolor', 'b'),
    (2, 'Iris_Virginica', 'g'),
]

# Translate the integer class codes into species names with a plain column
# assignment. The former chained `iris_df['Target'].replace(..., inplace=True)`
# calls raise a pandas FutureWarning and no longer modify the frame in
# pandas 3.0, which would leave every selection below empty.
code_to_species = {code: species for code, species, _ in species_styles}
iris_df['Target'] = pd.Series(iris_database.target).map(code_to_species)

for _, species, colour in species_styles:
    sepal_lengths = iris_df.loc[iris_df['Target'] == species, 'Sepal_Length']
    sns.kdeplot(sepal_lengths, color=colour, fill=True, label=species)

plt.xlabel('Sepal Length')
plt.ylabel('Probability Density')
# The legend ties each coloured curve to its species.
plt.legend()



# Box plot with an overlaid violin plot for each raw feature column, one
# column per panel of a 2x2 grid (four columns in total).
fig, axs = plt.subplots(2, 2)

# `axs.flat` walks the grid row by row, and `data.T` yields the feature
# columns in order, so panel k shows column k.
for ax, column in zip(axs.flat, data.T):
    ax.boxplot(column)
    ax.violinplot(column, widths=0.15)

plt.tight_layout()
plt.show()



# Run DBSCAN on the raw data.
# Notes on how DBSCAN works:
# - DBSCAN divides points into core points, border points and noise points.
# - Every cluster is made up of core points and border points.
# - A core point has at least `min_samples` points inside its eps-neighbourhood.
# - A border point is not a core point but lies in a core point's neighbourhood.
# - When `min_samples` is too large, some core points become border or noise
#   points, which makes some clusters disappear.
basic = DBSCAN(eps=0.4, min_samples=3)
basic.fit(data)
basic_label = basic.labels_

# Build the caption from the estimator's own parameters so it cannot drift
# from the values actually used (it used to read "min_simples").
print(f"db.labels_(eps={basic.eps},min_samples={basic.min_samples})=", basic_label)
print("\n")






# Principal component analysis: reduce the data to three dimensions, because
# a four-dimensional figure cannot be drawn.
pca = PCA(n_components=3)
newx = pca.fit_transform(data)

# 3-D scatter plot of the PCA projection, coloured by the dataset's class codes.
fig_0 = plt.figure()
ax_0 = fig_0.add_subplot(projection='3d')
# `newx.T` has one row per component, so unpacking it passes the three
# coordinate arrays in order.
ax_0.scatter(*newx.T, c=target)
ax_0.set_title("3d Scatter plot for PCA")
plt.show()


# Try DBSCAN on the principal components instead of the raw features.
db = DBSCAN(eps=0.5, min_samples=4).fit(newx)
labels = db.labels_

# Build the caption from the estimator's own parameters so it cannot drift
# from the values actually used (it used to read "min_simples").
print(f"db.labels_(eps={db.eps},min_samples={db.min_samples})=", labels)

# 3-D scatter plot of the PCA projection, coloured by the DBSCAN labels.
fig_1 = plt.figure()
ax_1 = plt.axes(projection='3d')
ax_1.scatter(newx[:, 0], newx[:, 1], newx[:, 2], c=labels)
ax_1.set_title('3d Scatter plot for DBSCAN')
plt.show()
target= [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


     Sepal_Length  Sepal_Width  Patal_Length  Petal_Width
0             5.1          3.5           1.4          0.2
1             4.9          3.0           1.4          0.2
2             4.7          3.2           1.3          0.2
3             4.6          3.1           1.5          0.2
4             5.0          3.6           1.4          0.2
..            ...          ...           ...          ...
145           6.7          3.0           5.2          2.3
146           6.3          2.5           5.0          1.9
147           6.5          3.0           5.2          2.0
148           6.2          3.4           5.4          2.3
149           5.9          3.0           5.1          1.8

[150 rows x 4 columns]
C:\Users\tyq17\AppData\Local\Temp\ipykernel_22328\4175703781.py:27: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  iris_df['Target'].replace([0], 'Iris_Setosa', inplace=True)
db.labels_(eps=0.4,min_simples=3)= [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0 -1  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0
  0  0  1  1  1  1  1  1  1  2  1  1  2  1 -1  1 -1  1  1  1 -1  1  3  1
  3  1  1  1  1  1  1  1  1  1  1  3  1  1  1 -1  1  1  1  1  1  2  1  1
  1  1  2  1 -1  3  3  3  3 -1 -1 -1 -1 -1  3  3  3  3 -1  3  3 -1 -1 -1
  3  3 -1  3  3  3  3  3  3  3 -1 -1  3  3 -1 -1  3  3  3  3  3  3  3  3
  3  3  3  3  3  3]


db.labels_(eps=0.5,min_simples=4)= [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0
  0  0  1  1  1  1  1  1  1  2  1  1  2  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  2  1  1
  1  1  2  1  1  1  1  1  1  1 -1  1 -1 -1  1  1  1  1  1  1  1 -1 -1  1
  1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1]