电视节目按类型进行kmeans聚类

在用爬虫爬取了电视节目的类型信息之后，需要依据节目类型对这些节目进行 K-Means 聚类。

拆解任务

  • 读入文件
  • 计算节目类型的TF-IDF
  • 进行kmeans聚类
  • 画图展示聚类效果

计算TF-IDF

下面的代码片段假定 `prams` 已经是用 pandas 从 data.csv 读入的 DataFrame（读入代码见文末的“完整代码”）。

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# Replace missing genre entries with empty strings before vectorising.
prams['genres'] = prams['genres'].fillna('')
# Fit on the genre strings, then convert the result into a dense array of
# TF-IDF weights (one row per programme).
genre_weights = tfidf.fit_transform(prams['genres'])
tfidf_matrix = genre_weights.toarray()

kmeans聚类

from sklearn.cluster import KMeans

# Partition the TF-IDF rows into three clusters.
mykms = KMeans(n_clusters=3)
# fit_predict fits the model and returns the per-row cluster labels
# (the same values that are exposed afterwards as mykms.labels_).
k = mykms.fit_predict(tfidf_matrix)

画图

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Reduce the TF-IDF weights to two dimensions with t-SNE so they can be drawn.
# (Author's note: preferred over PCA for the quality of the embedding, but slower.)
tsne = TSNE(n_components=2)
decomposition_data = tsne.fit_transform(tfidf_matrix)
# The two embedding columns are the scatter-plot coordinates.
x = decomposition_data[:, 0]
y = decomposition_data[:, 1]
fig = plt.figure(figsize=(10, 10))
ax = plt.axes()
ax.set_title('k=3')
# Colour every point by its K-Means label `k`; the original passed `c=z`,
# a name that is never defined, which raised NameError.
plt.scatter(x, y, c=k, marker="x")
# Hide the axis ticks.
plt.xticks(())
plt.yticks(())

完整代码

import pandas as pd
import numpy as np
import xlwt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the crawled programme table from a GBK-encoded CSV, skipping malformed
# rows. `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement (skip the row and emit a warning).
# Older pandas does not know `on_bad_lines` and raises TypeError, in which
# case the legacy keyword is used instead.
_csv_options = dict(sep=',', encoding='gbk', header='infer')
try:
    prams = pd.read_csv('data.csv', on_bad_lines='warn', **_csv_options)
except TypeError:
    prams = pd.read_csv('data.csv', error_bad_lines=False, **_csv_options)
# Snapshot of the table as nested Python lists (one inner list per row),
# taken before any later modification of `prams`.
tt = np.array(prams)
test = tt.tolist()
# Excel workbook with a single sheet that receives the clustering result.
wb = xlwt.Workbook()
ws = wb.add_sheet('cluster')

# Vectoriser configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# Replace missing genre entries with empty strings before vectorising.
prams['genres'] = prams['genres'].fillna('')
# Fit on the genre strings, then convert the result into a dense array of
# TF-IDF weights (one row per programme).
genre_weights = tfidf.fit_transform(prams['genres'])
tfidf_matrix = genre_weights.toarray()

# Partition the TF-IDF rows into three clusters.
mykms = KMeans(n_clusters=3)
# fit_predict fits the model and returns the per-row cluster labels
# (the same values that are exposed afterwards as mykms.labels_).
k = mykms.fit_predict(tfidf_matrix)

# Export the clustering result: sheet column i lists the data rows whose
# cluster label is i, one entry per sheet row.
for i in range(0, 3):
    # Rows of `test` are aligned with the labels in `k`.
    label_i = [row for row, label in zip(test, k) if label == i]
    for m, n in enumerate(label_i):
        # NOTE(review): `n` is a whole data row (a list), not a scalar;
        # confirm xlwt accepts it for the columns present in data.csv.
        ws.write(m, i, n)
# Save once after all cells are written. The original saved inside the inner
# loop, rewriting the whole file for every row.
wb.save('t.xls')

# Reduce the TF-IDF weights to two dimensions with t-SNE so they can be drawn.
# (Author's note: preferred over PCA for the quality of the embedding, but slower.)
tsne = TSNE(n_components=2)
decomposition_data = tsne.fit_transform(tfidf_matrix)
# The two embedding columns are the scatter-plot coordinates.
x = decomposition_data[:, 0]
y = decomposition_data[:, 1]
fig = plt.figure(figsize=(10, 10))
ax = plt.axes()
ax.set_title('k=3')
# Colour every point by its K-Means label `k`; the original passed `c=z`,
# a name that is never defined, which raised NameError.
plt.scatter(x, y, c=k, marker="x")
# Hide the axis ticks.
plt.xticks(())
plt.yticks(())
# `aspect` is not a savefig option (older Matplotlib ignored it, newer
# releases reject unknown keywords), so it is no longer passed.
plt.savefig('./y3.png')