import jieba
import jieba.analyse
import math
import operator
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth
from collections import Counter
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import os
#np.set_printoptions(threshold=np.inf)
#加载手工设置的某些词的idf值
def load_idf_file(path):
    """Load manually assigned IDF values from *path*.

    Each non-empty line has the form ``<word> <idf>`` (space separated).

    Returns a dict mapping word -> float IDF value.
    """
    idf_dict = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path, 'r', encoding='utf8') as handle:
        for line in handle:
            line = line.strip()
            if len(line) > 0:
                line_arr = line.split(' ')
                idf_dict[line_arr[0]] = float(line_arr[1])
    return idf_dict
def cal_idf(data_set, idf_dict):
    """Compute the IDF of every word appearing in *data_set*.

    data_set: iterable of space-joined token strings, one per document.
    idf_dict: manual word -> IDF overrides; these take precedence over
        the computed value.

    Returns a dict word -> IDF, where IDF = ln(doc_count / doc_freq).
    """
    doc_num = len(data_set)
    word_doc_count = defaultdict(int)
    for word_str in data_set:
        # set() so each word is counted at most once per document.
        for item in set(word_str.split(' ')):
            if item and item.strip() != '':
                word_doc_count[item] += 1
    word_idf = {}
    for word, count in word_doc_count.items():
        if word in idf_dict:
            # A manually assigned IDF wins over the computed one.
            word_idf[word] = idf_dict[word]
        else:
            word_idf[word] = math.log(doc_num * 1.0 / count)
    return word_idf
def cal_tfidf(data_set, idf_ret):
    """Compute per-document TF-IDF weights.

    data_set: iterable of space-joined token strings, one per document.
    idf_ret: word -> IDF mapping covering every non-empty token.

    Returns a list (one entry per document) of defaultdicts mapping
    word -> (term_count / doc_length) * idf; absent words read as 0.
    """
    doc_word_tfidf = []
    for word_str in data_set:
        word_list = word_str.split(' ')
        # Document length includes empty tokens, matching the original
        # normalisation.
        doc_word_total = len(word_list)
        doc_word_dict = defaultdict(int)
        for item in word_list:
            if item and item.strip() != '':
                doc_word_dict[item] += 1
        doc_word_tfidf_dict = defaultdict(int)
        for word, count in doc_word_dict.items():
            doc_word_tfidf_dict[word] = (count / doc_word_total) * idf_ret[word]
        doc_word_tfidf.append(doc_word_tfidf_dict)
    return doc_word_tfidf
def save(idf_dict, path):
    """Write *idf_dict* to *path* as one ``word idf`` pair per line.

    Bug fix: the original opened the file in 'a+' and called truncate();
    in append mode the position is at end-of-file, so nothing was cleared
    and repeated saves accumulated duplicate lines.  Opening in 'w'
    truncates the file properly.
    """
    with open(path, 'w', encoding='utf8') as f:
        for key, value in idf_dict.items():
            f.write(str(key) + " " + str(value) + "\n")
# Tokenize documents with jieba keyword extraction.
def jieba_tokenize(items=None, stopwords=None):
    """Cut every document into keywords with jieba.analyse.

    items: documents to tokenize; defaults to the module-level ``datas``
        corpus for backward compatibility.
    stopwords: words to drop; defaults to the module-level ``lines``
        stop-word list for backward compatibility.

    Returns a list of space-joined keyword strings, one per document.
    """
    if items is None:
        items = datas        # module-level corpus loaded at startup
    if stopwords is None:
        stopwords = lines    # module-level stop-word list
    jieba_need = []
    for item in items:
        # Extract up to 10 keywords, restricted to content-bearing POS tags
        # (nouns, verbs, idioms, ...).
        temp_list1 = jieba.analyse.extract_tags(
            item, topK=10,
            allowPOS=('an', 'i', 'j', 'l', 'Ng', 'n', 'nr', 'ns', 'nt',
                      'nz', 's', 'vn', 'un', 'v'))
        w_len = len(temp_list1)
        if w_len > 2:
            # Keep only the top 60% highest-weighted keywords.
            w_num = math.ceil(float(w_len) * 0.6)
            temp_list1 = temp_list1[0:w_num]
        # Drop stop words, bare spaces and single-character tokens.
        temp_list = [e for e in temp_list1
                     if e not in list(stopwords) + [' '] and len(e) > 1]
        jieba_need.append(" ".join(temp_list))
    return jieba_need
# Choose the dimensionality-reduction method.
def reduction(matrix, lsa=True, pca=False):
    """Reduce *matrix* to 50 dimensions.

    lsa: TruncatedSVD followed by L2 normalization (classic LSA).
    pca: PCA fitted on the raw input matrix.  When both flags are set the
        PCA result is returned, matching the original behaviour.

    Bug fix: when both flags were False the original raised
    UnboundLocalError on the return; we now return the matrix unchanged.
    """
    X_ = matrix
    if lsa:
        svd = TruncatedSVD(50)
        # copy=False normalizes in place to avoid an extra allocation.
        normalizer = Normalizer(copy=False)
        X_ = make_pipeline(svd, normalizer).fit_transform(matrix)
    if pca:
        # Note: PCA is fitted on the ORIGINAL matrix, not the LSA output.
        X_ = PCA(n_components=50).fit_transform(matrix)
    return X_
# Point jieba.analyse at the custom IDF table.
jieba.analyse.set_idf_path("./jieba/idf.txt")
data_num = 10000  # maximum number of corpus lines to load
k_num = 200       # cluster count for the commented-out KMeans variants
i = 0
# Read up to data_num non-empty job-title lines from the corpus file.
output = open('../jobname3.txt', 'r',encoding= 'utf8')
line = output.readline()  # read the file line by line
datas = []
while line:
    line = line.strip()
    if len(line)>0:
        i=i+1
        datas.append(line)
        if i>=data_num:break
    line = output.readline()
output.close()
npyfile = "data.npy"
if os.path.exists(npyfile):
    # Reuse a previously cached TF-IDF matrix.
    X = np.load("data.npy")
else:
    # Load the stop-word list, one word per line.
    with open('./stop_words.txt',encoding='utf-8') as f:
        entities = list(f)
    lines = []
    for line in entities:
        line1 = line.strip()
        lines.append(line1)
    # Load the manually assigned IDF values.
    default_idf_dict = load_idf_file("./jieba/idf.txt")
    my_train = jieba_tokenize()
    idf_ret = cal_idf(my_train,default_idf_dict)
    tfidf_ret = cal_tfidf(my_train,idf_ret)
    doc_rows = len(datas)
    word_rows = len(idf_ret)
    # Convert per-document TF-IDF dicts into a dense [docs, vocab] matrix.
    X = np.zeros([doc_rows,word_rows])
    for i in range(doc_rows):
        j = 0
        for k,v in idf_ret.items():
            # tfidf_ret[i] is a defaultdict(int): absent words yield 0.
            X[i][j] = tfidf_ret[i][k]
            j=j+1
    X = np.array(X)
    # NOTE(review): the cache is read above but never written because this
    # save is commented out -- confirm whether caching is meant to be on.
    #np.save("data.npy",X)
X = reduction(X,lsa=True,pca=False)
# Alternative clustering algorithms, kept for experimentation:
#cl = MiniBatchKMeans(n_clusters=k_num, init='k-means++', n_init=1,init_size=1000, batch_size=10000, verbose=False)
#cl = KMeans(n_clusters=k_num, init='k-means++', random_state=30, n_init=1,verbose=False)
cl = DBSCAN(eps=0.2, min_samples=30)
#cl = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.8, n_samples=400), bin_seeding=True)
#cl = AffinityPropagation()
result = cl.fit_predict(X)
num_clusters = len(set(result))
# Write each cluster's documents to ./result/result_<cluster-id>.txt.
# NOTE(review): ret is sized len(result) (= number of documents), much
# larger than the cluster count, and DBSCAN noise points (label -1) land
# in ret[-1] and are never written out -- confirm this is intended.
ret = [[] for y in range(len(result))]
for i in range(len(datas)):
    classid = result[i]
    ret[classid].append(datas[i])
for m in range(num_clusters):
    file = "./result/result_"+str(m)+".txt"
    handle = open(file, 'w+',encoding= 'utf8')
    for n in range(len(ret[m])):
        handle.write(ret[m][n]+"\n")
    handle.close()
# Quick scatter plot of the cluster label assigned to each document.
plt.plot(list(range(data_num)), list(result), '.r')
plt.show()
exit()
|
# coding=utf-8
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import jieba
import pickle
import zlib
__author__ = 'rockychi1001@gmail.com'
def jieba_tokenizer(x):
    """Tokenizer hook for TfidfVectorizer: segment *x* with jieba."""
    return jieba.cut(x)
def load_info(fileName):
    """Read the whole text file *fileName* and return its contents.

    Replaces the original line-by-line ``data += line`` loop, which is
    quadratic in the worst case, with a single read; the result is
    byte-identical.  The unused BLOCK_SIZE constant and dead
    zlib/pickle experiments were removed.
    """
    with open(fileName) as handle:
        return handle.read()
def save_info(fileName, data):
    """Write *data* to *fileName*, replacing any previous contents.

    Bug fix: the original left the file handle unclosed (relying on GC
    to flush); `with` closes and flushes it deterministically.
    """
    with open(fileName, 'w') as handle:
        handle.write(data)
# Train on the NetEase news data set and persist the fitted model.
def train_data(training_data):
    """Fit a TF-IDF vectorizer plus LinearSVC and save both to disk.

    training_data: a sklearn Bunch as returned by load_files, with
        .data (documents) and .target (labels).
    """
    # Keep essentially everything for training; the split API needs a
    # non-empty test fraction, hence the vanishingly small test_size.
    x_train, _, y_train, _ = train_test_split(
        training_data.data, training_data.target, test_size=0.00000001)
    # fit_transform learns the vocabulary and IDF from the training set.
    vectorizer = TfidfVectorizer(binary=False, tokenizer=jieba_tokenizer)
    train_matrix = vectorizer.fit_transform(x_train)
    # Fit the linear SVM classifier on the TF-IDF features.
    classifier = LinearSVC().fit(train_matrix, y_train)
    save_info("train1.clf", pickle.dumps(classifier))
    save_info("train1.vec", pickle.dumps(vectorizer))
# Load the documents to predict and classify them with the saved model.
def test_data():
    # Restore the classifier and vectorizer persisted by train_data().
    clf = pickle.loads(load_info("train1.clf"))
    words_tfidf_vec = pickle.loads(load_info("train1.vec"))
    testing_data = load_files(ur'D:\work\pytest\text_classifier\predict_data', encoding='utf-8')
    # Keep essentially everything as the test split (mirror of the tiny
    # train split used in train_data).
    _, x_test, _, _ = train_test_split(testing_data.data, testing_data.target, test_size=0.99999999)
    # transform (not fit_transform): reuse the training vocabulary/IDF.
    X_test = words_tfidf_vec.transform(x_test)
    # Predict a label index for every test document.
    pred = clf.predict(X_test)
    return pred
training_data = load_files(ur'D:\work\pytest\text_classifier\netease', encoding='utf-8')
#train_data(training_data)
#print 'finish!'
#exit()
pred = test_data()
for label in pred:
print u'predict label: %s ' % training_data.target_names[pred]
|
#!/usr/bin/env python
import os
import sys
import pickle
import zlib
import cv2
import numpy as np
import urllib2
import time
import random
from Queue import Queue
from threading import Thread
from sklearn.cluster import MiniBatchKMeans
from scipy.sparse import lil_matrix, csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
FILE_LOAD_THREADS = 1        # worker threads for image feature extraction
FILE_SEED = 24               # seed for shuffling the train/test file lists
CLUSTER_SEED = 24            # random_state for MiniBatchKMeans
CLUSTERS_NUMBER = 1000       # visual-vocabulary size (bag-of-features bins)
BAYES_ALPHA = 0.1            # MultinomialNB smoothing parameter
ADA_BOOST_ESTIMATORS = 110   # boosting rounds for each AdaBoost classifier
VERBOSE = True               # enable progress logging
USE_CACHE = True             # cache samples/kmeans/clusters in cache.bin
_g_removed = False           # set when a corrupt image file was deleted
class PCR:
    """Binary image classifier.

    Features per image: (1) SIFT descriptors quantized against a
    MiniBatchKMeans visual vocabulary (bag of visual words) and (2) a
    256-bin hue histogram.  Each feature set is TF-IDF weighted and fed
    to its own AdaBoosted MultinomialNB; the two votes are combined at
    prediction time.
    """
    def __init__(self):
        self.__clustersNumber = CLUSTERS_NUMBER
        self.__queue = Queue()       # work queue for feature extraction
        self.__verbose = VERBOSE
        self.__useCache = USE_CACHE
        # Daemon workers consume (func, args) tasks from the queue.
        for i in range(FILE_LOAD_THREADS):
            t = Thread(target = self.__worker)
            t.daemon = True
            t.start()
        # Visual vocabulary used to quantize SIFT descriptors.
        self.__kmeans = MiniBatchKMeans(n_clusters = self.__clustersNumber, random_state = CLUSTER_SEED, verbose = self.__verbose)
        # Separate TF-IDF transforms for descriptor counts / color hist.
        self.__tfidf = TfidfTransformer()
        self.__tfidf1 = TfidfTransformer()
        # One boosted naive-Bayes classifier per feature set.
        self.__clf = AdaBoostClassifier(MultinomialNB(alpha = BAYES_ALPHA), n_estimators = ADA_BOOST_ESTIMATORS)
        self.__clf1 = AdaBoostClassifier(MultinomialNB(alpha = BAYES_ALPHA), n_estimators = ADA_BOOST_ESTIMATORS)
    def __worker(self):
        # Queue consumer: run each task, log failures, always mark done
        # so __loadSamples' join() cannot deadlock.
        while True:
            task = self.__queue.get()
            func, args = task
            try:
                func(args)
            except Exception as e:
                print 'EXCEPTION:', e
            self.__queue.task_done()
    def train(self, positiveFiles, negativeFiles):
        # Fit the visual vocabulary, TF-IDF transforms and classifiers.
        cachedData = self.__loadCache()
        if cachedData is None:
            self.__log('loading positives')
            positiveSamples = self.__loadSamples(positiveFiles)
            self.__log('loading negatives')
            negativeSamples = self.__loadSamples(negativeFiles)
            # Pool every descriptor from both classes to learn the
            # visual vocabulary.
            totalDescriptors = []
            self.__addDescriptors(totalDescriptors, positiveSamples)
            self.__addDescriptors(totalDescriptors, negativeSamples)
            self.__kmeans.fit(totalDescriptors)
            clusters = self.__kmeans.predict(totalDescriptors)
            self.__printDistribution(clusters)
            self.__saveCache((positiveSamples, negativeSamples, self.__kmeans, clusters))
        else:
            self.__log('using cache')
            positiveSamples, negativeSamples, self.__kmeans, clusters = cachedData
        totalSamplesNumber = len(negativeSamples) + len(positiveSamples)
        # Sparse count matrices: visual-word counts and histogram bins.
        counts = lil_matrix((totalSamplesNumber, self.__clustersNumber))
        counts1 = lil_matrix((totalSamplesNumber, 256))
        # Cursors shared by __calculteCounts across both calls; positives
        # MUST be counted first to match the `classes` ordering below.
        self.__currentSample = 0
        self.__currentDescr = 0
        self.__calculteCounts(positiveSamples, counts, counts1, clusters)
        self.__calculteCounts(negativeSamples, counts, counts1, clusters)
        counts = csr_matrix(counts)
        counts1 = csr_matrix(counts1)
        self.__log('training bayes classifier')
        tfidf = self.__tfidf.fit_transform(counts)
        tfidf1 = self.__tfidf1.fit_transform(counts1)
        classes = [True] * len(positiveSamples) + [False] * len(negativeSamples)
        self.__clf.fit(tfidf, classes)
        self.__clf1.fit(tfidf1, classes)
        self.__log('training complete')
    def predict(self, files):
        # Classify *files*; returns a list of booleans (True = positive).
        self.__log('loading files')
        samples = self.__loadSamples(files)
        totalDescriptors = []
        self.__addDescriptors(totalDescriptors, samples)
        self.__log('predicting classes')
        clusters = self.__kmeans.predict(totalDescriptors)
        counts = lil_matrix((len(samples), self.__clustersNumber))
        counts1 = lil_matrix((len(samples), 256))
        self.__currentSample = 0
        self.__currentDescr = 0
        self.__calculteCounts(samples, counts, counts1, clusters)
        counts = csr_matrix(counts)
        counts1 = csr_matrix(counts1)
        # transform only -- reuse the TF-IDF statistics learned in train().
        tfidf = self.__tfidf.transform(counts)
        tfidf1 = self.__tfidf1.transform(counts1)
        self.__log('classifying')
        weights = self.__clf.predict_log_proba(tfidf.toarray())
        weights1 = self.__clf1.predict_log_proba(tfidf1.toarray())
        predictions = []
        for i in xrange(0, len(weights)):
            # Log-probability margins: negative w means class 1 (positive)
            # is more likely for that feature set.
            w = weights[i][0] - weights[i][1]
            w1 = weights1[i][0] - weights1[i][1]
            pred = w < 0
            pred1 = w1 < 0
            if pred != pred1:
                # The two classifiers disagree: let the combined margin decide.
                pred = w + w1 < 0
            predictions.append(pred)
        self.__log('prediction complete')
        return predictions
    def saveModel(self, fileName):
        # Persist the whole fitted model as compressed pickle.
        data = pickle.dumps((self.__clustersNumber, self.__kmeans, self.__tfidf, self.__tfidf1, self.__clf, self.__clf1), -1)
        data = zlib.compress(data)
        open(fileName, 'w').write(data)
    def loadModel(self, fileName):
        # Inverse of saveModel: decompress and unpickle the model state.
        data = open(fileName, 'r').read()
        data = zlib.decompress(data)
        data = pickle.loads(data)
        self.__clustersNumber, self.__kmeans, self.__tfidf, self.__tfidf1, self.__clf, self.__clf1 = data
    def __log(self, message):
        if self.__verbose:
            print message
    def __saveCache(self, data):
        # Write the (samples, kmeans, clusters) tuple to cache.bin.
        if not self.__useCache:
            return
        data = pickle.dumps(data, -1)
        data = zlib.compress(data)
        open('cache.bin', 'w').write(data)
    def __loadCache(self):
        # Returns the cached training tuple, or None when disabled/missing.
        if not self.__useCache:
            return None
        if not os.path.isfile('cache.bin'):
            return None
        data = open('cache.bin', 'r').read()
        data = zlib.decompress(data)
        data = pickle.loads(data)
        return data
    def __calculteCounts(self, samples, counts, counts1, clusters):
        # Fill one row per sample: visual-word counts into `counts`,
        # histogram bins into `counts1`.  Relies on the instance cursors
        # self.__currentSample / self.__currentDescr set by the caller.
        # NOTE(review): `cn` is unused.
        cn = self.__clustersNumber
        for s in samples:
            currentCounts = {}
            # s[0] = descriptors; each one consumes the next global
            # cluster assignment from `clusters`.
            for d in s[0]:
                currentCounts[clusters[self.__currentDescr]] = currentCounts.get(clusters[self.__currentDescr], 0) + 1
                self.__currentDescr += 1
            for clu, cnt in currentCounts.iteritems():
                counts[self.__currentSample, clu] = cnt
            # s[1] = 256-bin color histogram of [count] entries.
            for i, histCnt in enumerate(s[1]):
                counts1[self.__currentSample, i] = histCnt[0]
            self.__currentSample += 1
    def __printDistribution(self, clusters):
        # Debug aid: show the biggest and smallest visual-word buckets.
        if not self.__verbose:
            return
        distr = {}
        for c in clusters:
            distr[c] = distr.get(c, 0) + 1
        v = sorted(distr.values(), reverse=True)
        print 'distribution:', v[0:15], '...', v[-15:]
    def __addDescriptors(self, totalDescriptors, samples):
        # Flatten every sample's descriptor list into totalDescriptors.
        for sample in samples:
            for descriptor in sample[0]:
                totalDescriptors.append(descriptor)
    def __loadSamples(self, files):
        # Extract features for every file via the worker queue.
        # The [[]] * len(files) placeholders are all the same list object,
        # but that is harmless: each slot is replaced wholesale by
        # __loadSingleSample.
        samples = [[]] * len(files)
        n = 0
        for f in files:
            self.__queue.put((self.__loadSingleSample, (f, samples, n)))
            n += 1
        self.__queue.join()
        if _g_removed:
            # A corrupt file was deleted; abort so indices stay consistent.
            print ' === REMOVED = TERMINATE'
            sys.exit(44)
        return samples
    def __loadSingleSample(self, args):
        # Worker task: compute (descriptors, histogram) for one file.
        global _g_removed
        fileName, samples, sampleNum = args
        des, hist = self.__getFeatures(fileName)
        if des is None:
            # Unreadable image: delete it and flag the run for termination.
            print 'ERROR: failed to load', fileName
            os.remove(fileName)
            _g_removed = True
            des = []
            hist = [[0]] * 256
        samples[sampleNum] = (des, hist)
    def __getFeatures(self, fileName):
        # Per-file feature cache keyed by CRC32 of the file NAME.
        fid = 'cache/' + str(zlib.crc32(fileName))
        self.__log('loading %s' % fileName)
        if os.path.isfile(fid):
            des, hist = pickle.loads(open(fid, 'rb').read())
        else:
            img = cv2.imread(fileName)
            if img.shape[1] > 1000:
                # Downscale very wide images before extracting features.
                cf = 1000.0 / img.shape[1]
                newSize = (int(cf * img.shape[0]), int(cf * img.shape[1]), img.shape[2])
                img.resize(newSize)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # Detect keypoints on grayscale, describe with OpponentSIFT
            # on the color image.
            s = cv2.SIFT(nfeatures = 400)
            d = cv2.DescriptorExtractor_create("OpponentSIFT")
            kp = s.detect(gray, None)
            kp, des = d.compute(img, kp)
            hist = self.__getColorHist(img)
            # NOTE(review): the cache write is disabled, so the cache is
            # read-only unless populated externally -- confirm intent.
            #open(fid, 'wb').write(pickle.dumps((des, hist), -1))
        return des, hist
    def __getColorHist(self, img):
        # 256-bin histogram of the hue channel (HSV channel 0).
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        dist = cv2.calcHist([hsv],[0],None,[256],[0,256])
        return dist
def loadDir(dirName):
    """Return the paths of all ``.jpg`` files directly inside *dirName*.

    Paths are joined with '/' exactly as the original did, so callers
    that inspect path prefixes keep working.
    """
    return [dirName + '/' + f
            for f in os.listdir(dirName)
            if f.endswith('.jpg')]
def loadFileLists():
    # Build (positive, negative, test) file lists from the on-disk
    # directories '2' (positives), '1' (negatives) and '1test'.
    # Deterministic shuffle so splits are reproducible across runs.
    random.seed(FILE_SEED)
    positiveFiles = sorted(loadDir('2'))
    negativeFiles = sorted(loadDir('1'))
    random.shuffle(positiveFiles)
    random.shuffle(negativeFiles)
    # Hold out 20% (of the smaller class size) from each class.
    minLen = min(len(positiveFiles), len(negativeFiles))
    p20 = int(0.2 * minLen)
    testFiles = positiveFiles[:p20] + negativeFiles[:p20]
    positiveFiles = positiveFiles[p20:]
    negativeFiles = negativeFiles[p20:]
    print testFiles[0], negativeFiles[0], positiveFiles[0]
    # NOTE(review): the held-out split computed above is discarded here --
    # the test set is overridden with the contents of '1test'; confirm
    # this override is intentional.
    testFiles = loadDir('1test')
    return positiveFiles, negativeFiles, testFiles
def train():
    """Train the PCR model on the on-disk file lists and persist it."""
    positives, negatives, _ = loadFileLists()
    model = PCR()
    model.train(positives, negatives)
    model.saveModel('model.bin')
def predict():
positiveFiles, negativeFiles, testFiles = loadFileLists()
testFiles = testFiles
pcr = PCR()
pcr.loadModel('model.bin')
pred = pcr.predict(testFiles)
total = 0
correct = 0
for i in xrange(0, len(testFiles)):
isCorrect = ((testFiles[i][0] == '1' and not pred[i]) or (testFiles[i][0] == '2' and pred[i]))
print isCorrect, pred[i], testFiles[i]
#if not isCorrect:
# print testFiles[i]
correct += int(isCorrect)
total += 1
print 'sum: \t', float(correct) / total
def predictTest(file):
files = [file]
pcr = PCR()
pcr.loadModel('model.bin')
pred = pcr.predict(files)
print '\n\n ===', pred[0], '===\n\n'
def predictUrl(url):
    """Download the image at *url* to test.jpg and classify it."""
    filename = 'test.jpg'
    with open(filename, 'wb') as out:
        out.write(urllib2.urlopen(url).read())
    time.sleep(0.5)  # brief pause before the file is read back
    predictTest(filename)
def printUsage():
    # Print CLI usage and abort with a distinctive exit code.
    print 'Usage: '
    print ' %s train - train model' % sys.argv[0]
    print ' %s url http://sample.com/img.jpg - check given url' % sys.argv[0]
    sys.exit(42)
predictTest('./img/')#0_18.jpg
'''
if __name__ == '__main__':
if len(sys.argv) < 2:
printUsage()
mode = sys.argv[1]
if mode == 'train':
train()
time.sleep(0.5)
predict()
elif mode == 'url':
if len(sys.argv) < 3:
printUsage()
url = sys.argv[2]
predictUrl(url)
else:
printUsage()
'''
|