cluster
import jieba
import jieba.analyse
import math
import operator
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth
from collections import Counter
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import os
#np.set_printoptions(threshold=np.inf)

# Load manually assigned idf values for specific words
def load_idf_file(path):
    idf_dict = {}
    with open(path, 'r', encoding='utf8') as handle:
        for line in handle:                  # one "word idf" pair per line
            line = line.strip()
            if len(line) > 0:
                line_arr = line.split(' ')
                idf_dict[line_arr[0]] = float(line_arr[1])
    return idf_dict

# Compute idf for every word in the corpus; words present in idf_dict keep their manually assigned value
def cal_idf(data_set,idf_dict):
    doc_num = len(data_set)
    #word_doc_count = {}
    word_doc_count=defaultdict(int)
    for word_str in data_set:
        word_list = word_str.split(' ')
        word_list = list(set(word_list))
        for item in word_list:
            if item and item.strip()!='':
                word_doc_count[item]+=1

    word_idf = {}
    default_idf_keys = idf_dict.keys()
    for k,v in word_doc_count.items():  
        idf = math.log(doc_num*1.0 / v)  
        if k in default_idf_keys: word_idf[k] = idf_dict[k]
        else:word_idf[k] = idf
    #path = "idf.txt"  
    #save(word_idf, path)  
    return word_idf

# Compute per-document tf-idf weights: tf = term count / document length, weighted by idf
def cal_tfidf(data_set,idf_ret):
    doc_word_tfidf = []
    
    i = 0
    for word_str in data_set:
        word_list = word_str.split(' ')
        doc_word_total = len(word_list)
        doc_word_dict = defaultdict(int)
        doc_word_tfidf_dict = defaultdict(int)
        for item in word_list:
            if item and item.strip()!='':
                doc_word_dict[item]+=1
        for k,v in doc_word_dict.items():
            #exit()
            doc_word_tfidf_dict[k]=(v/doc_word_total)*idf_ret[k]
        #print(doc_word_tfidf_dict)
        #exit()
        doc_word_tfidf.append(doc_word_tfidf_dict)
        i=i+1
    #print(doc_word_tfidf)
    return doc_word_tfidf    
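
# Worked example of the formulas implemented above (illustrative numbers, not taken from the data):
#   idf(w)      = ln(doc_num / doc_count(w))        e.g. ln(10000 / 100) ≈ 4.605
#   tfidf(w, d) = (count(w, d) / len(d)) * idf(w)   e.g. (2 / 10) * 4.605 ≈ 0.921
# Words listed in the manual idf file keep their assigned idf instead of the computed one.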


def save(idf_dict, path):
    # overwrite any existing file with the "word idf" pairs
    f = open(path, 'w', encoding='utf8')
    for key in idf_dict.keys():
        f.write(str(key) + " " + str(idf_dict[key]) + "\n")
    f.close()

# Tokenize each text in `datas` with jieba keyword extraction, dropping stop words
def jieba_tokenize():
    jieba_need =[]
    for item in datas:
        #item = u'数据挖掘工程师'
        
        #temp_list1=jieba.cut_for_search(item)
        #'''
        temp_list1 = jieba.analyse.extract_tags(item,topK=10,allowPOS=('an', 'i', 'j', 'l', 'Ng', 'n', 'nr', 'ns', 'nt', 'nz', 's', 'vn', 'un','v')) #
        w_len = len(temp_list1)
        #print(temp_list1)
        if w_len>2:
            w_num = math.ceil(float(w_len)*0.6)
            temp_list1 = temp_list1[0:w_num]
        #'''
        #temp_list1=jieba.cut(item)
        #print(temp_list1)
        temp_list=[e for e in temp_list1 if e not in lines + [' '] and len(e)>1]
        #print(temp_list)
        #exit()
        jieba_need.append(" ".join(temp_list))
    return jieba_need


# Dimensionality reduction: LSA (TruncatedSVD + normalization) or PCA
def reduction(matrix,lsa=True,pca=False):
    if lsa:
        #tfidf_matrix = vectorizer.fit_transform(datas)# without prior tokenization
        svd = TruncatedSVD(50)
        normalizer = Normalizer(copy=False)
        lsa_pipeline = make_pipeline(svd, normalizer)
        return lsa_pipeline.fit_transform(matrix)
    if pca:
        return PCA(n_components=50).fit_transform(matrix)
    return matrix   # neither method selected: return the matrix unchanged



jieba.analyse.set_idf_path("./jieba/idf.txt")
data_num = 10000
k_num = 200

i = 0
datas = []
with open('../jobname3.txt', 'r', encoding='utf8') as source:
    for line in source:                 # read the job titles line by line
        line = line.strip()
        if len(line) > 0:
            i = i + 1
            datas.append(line)
            if i >= data_num: break


npyfile = "data.npy"
if os.path.exists(npyfile):
    X = np.load("data.npy")
    #print(X)
else:
    # Load the stop-word list
    with open('./stop_words.txt',encoding='utf-8') as f:
        lines = [line.strip() for line in f]
    # Load manually assigned idf values
    default_idf_dict = load_idf_file("./jieba/idf.txt")
    #default_idf_dict = {}
    my_train = jieba_tokenize()


    idf_ret = cal_idf(my_train,default_idf_dict)
    tfidf_ret = cal_tfidf(my_train,idf_ret)
    doc_rows = len(datas)
    word_rows = len(idf_ret)
    # Convert each document's per-word tf-idf weights into a matrix of shape [num_docs, num_words]
    X = np.zeros([doc_rows,word_rows])
    for i in range(doc_rows):
        j = 0
        for k,v in idf_ret.items():
            #print(i,k,v)
            #print(tfidf_ret[i][k])
            X[i][j] = tfidf_ret[i][k]
            j=j+1
    X = np.array(X)
    #print(X)
    #np.save("data.npy",X)
X = reduction(X,lsa=True,pca=False)
#X = tdm #np.array(tdm)
#cl = MiniBatchKMeans(n_clusters=k_num, init='k-means++', n_init=1,init_size=1000, batch_size=10000, verbose=False)
#cl = KMeans(n_clusters=k_num, init='k-means++', random_state=30, n_init=1,verbose=False)
cl = DBSCAN(eps=0.2, min_samples=30)
#cl = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.8, n_samples=400), bin_seeding=True)
#cl = AffinityPropagation()
#cl.fit(X)
#result = cl.predict(X)
result = cl.fit_predict(X)

num_clusters = len(set(result))




# Write the clustering results: group the original texts by cluster label (DBSCAN marks noise as -1)
ret = defaultdict(list)
for i in range(len(datas)):
    ret[result[i]].append(datas[i])

for label, texts in ret.items():
    file = "./result/result_" + str(label) + ".txt"
    handle = open(file, 'w+', encoding='utf8')
    for text in texts:
        handle.write(text + "\n")
    handle.close()

plt.plot(list(range(len(result))), list(result), '.r')
plt.show()

exit()
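
For comparison, the same pipeline (tf-idf weighting, LSA reduction, DBSCAN clustering) can be expressed with scikit-learn's built-in TfidfVectorizer instead of the hand-rolled cal_idf/cal_tfidf plus matrix-filling loop. This is only a minimal sketch assuming the input is the whitespace-joined jieba tokens produced by jieba_tokenize(); results will differ slightly because TfidfVectorizer uses smoothed idf by default and does not apply the manual idf overrides.

from sklearn.cluster import DBSCAN
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

def cluster_with_sklearn(tokenized_docs, eps=0.2, min_samples=30):
    # documents are already tokenized, so tokens are just whitespace-separated strings
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")
    tfidf = vectorizer.fit_transform(tokenized_docs)
    # LSA: truncated SVD followed by L2 normalization, as in reduction() above
    lsa = make_pipeline(TruncatedSVD(50), Normalizer(copy=False))
    reduced = lsa.fit_transform(tfidf)
    return DBSCAN(eps=eps, min_samples=min_samples).fit_predict(reduced)

#labels = cluster_with_sklearn(my_train)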
PHP TF-IDF and cosine similarity
<?php
function similarity(array $vec1, array $vec2) {
  return dotProduct($vec1, $vec2) / (absVector($vec1) * absVector($vec2));
}
 
function dotProduct(array $vec1, array $vec2) {
  $result = 0;
  foreach (array_keys($vec1) as $key1) {
    foreach (array_keys($vec2) as $key2) {
      if ($key1 === $key2) $result += $vec1[$key1] * $vec2[$key2];
    }
  }
  return $result;
}
 
function absVector(array $vec) {
  $result = 0;
  foreach (array_values($vec) as $value) {
    $result += $value * $value;
  }
  return sqrt($result);
}
 
//Term-frequency vectors of the two documents
$v1 = array('我们' => 5, '设计' => 2,  '一个' => 1, '算法' =>0, '任意' => 0, '相似' => 1);
$v2 = array('我们' => 5, '设计' => 0,  '一个' => 3, '算法' =>0, '任意' => 0, '相似' => 1);
//Compute the similarity; the larger the value, the more similar the documents are
$result1 = similarity($v1,$v2);
var_dump($result1);
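
The same cosine computation in Python, kept deliberately close to the PHP functions above (dot product over shared keys divided by the product of the vector norms); the vectors are the example term-frequency vectors from the PHP snippet.

import math

def cosine_similarity(vec1, vec2):
    # dot product over the keys the two vectors share
    dot = sum(vec1[k] * vec2[k] for k in set(vec1) & set(vec2))
    norm1 = math.sqrt(sum(v * v for v in vec1.values()))
    norm2 = math.sqrt(sum(v * v for v in vec2.values()))
    return dot / (norm1 * norm2)

v1 = {'我们': 5, '设计': 2, '一个': 1, '算法': 0, '任意': 0, '相似': 1}
v2 = {'我们': 5, '设计': 0, '一个': 3, '算法': 0, '任意': 0, '相似': 1}
print(cosine_similarity(v1, v2))   # ≈ 0.88, the same value the PHP code prints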
test
# coding=utf-8
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import jieba
import pickle
import zlib
__author__ = 'rockychi1001@gmail.com'


def jieba_tokenizer(x): return jieba.cut(x)
def load_info(fileName):
    # read the pickled object back as raw bytes (binary mode keeps the pickle data intact)
    with open(fileName, 'rb') as f:
        return f.read()

def save_info(fileName, data):
    # write the pickled object as raw bytes
    with open(fileName, 'wb') as f:
        f.write(data)

# Load the training texts (categorized data crawled from NetEase news) and keep (almost) all of them as the training set
def train_data(training_data):

    x_train, _, y_train, _ = train_test_split(training_data.data, training_data.target, test_size=0.00000001)
    # Build the tf-idf vector space model for the words; note that the training data goes through the fit_transform interface
    words_tfidf_vec = TfidfVectorizer(binary=False, tokenizer=jieba_tokenizer)
    #print words_tfidf_vec
    #exit()
    X_train = words_tfidf_vec.fit_transform(x_train)
    # Train the classifier
    clf = LinearSVC().fit(X_train, y_train)
    #data = pickle.dumps((clf,words_tfidf_vec), -1)
    #print data
    save_info("train1.clf",pickle.dumps(clf)) 

    save_info("train1.vec",pickle.dumps(words_tfidf_vec)) 

    #return words_tfidf_vec

# Load the texts to be predicted and use them as the test set
def test_data():
    clf = pickle.loads(load_info("train1.clf"))
    words_tfidf_vec = pickle.loads(load_info("train1.vec"))

    #data = pickle.loads(data)
    #clf,words_tfidf_vec = data
    testing_data = load_files(ur'D:\work\pytest\text_classifier\predict_data', encoding='utf-8')
    _, x_test, _, _ = train_test_split(testing_data.data, testing_data.target, test_size=0.99999999)
    # The test data only goes through the transform interface (the vectorizer is already fitted)
    X_test = words_tfidf_vec.transform(x_test)
    # Predict
    pred = clf.predict(X_test)
    return pred

training_data = load_files(ur'D:\work\pytest\text_classifier\netease', encoding='utf-8')
#train_data(training_data)
#print 'finish!'
#exit()
pred = test_data()
for label in pred:
    print u'predict label: %s ' % training_data.target_names[label]
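
A rough Python 3 equivalent of the train/predict flow above, assuming a current scikit-learn (sklearn.cross_validation has since been renamed sklearn.model_selection) and hypothetical directory paths; the key point is unchanged: fit only on the training corpus, then transform the texts to be predicted with the already-fitted vectorizer (here both steps are wrapped in a Pipeline).

import pickle

import jieba
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

def train_and_save(train_dir, model_path='train.pkl'):
    data = load_files(train_dir, encoding='utf-8')           # one sub-directory per class
    model = Pipeline([
        ('tfidf', TfidfVectorizer(tokenizer=jieba.cut)),      # jieba does the tokenizing
        ('svc', LinearSVC()),
    ])
    model.fit(data.data, data.target)                         # fit_transform happens inside the pipeline
    with open(model_path, 'wb') as f:
        pickle.dump((model, data.target_names), f)

def predict_dir(predict_dir, model_path='train.pkl'):
    with open(model_path, 'rb') as f:
        model, target_names = pickle.load(f)
    texts = load_files(predict_dir, encoding='utf-8').data
    return [target_names[label] for label in model.predict(texts)]   # transform only, no refitting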
PornDetector
#!/usr/bin/env python 

import os
import sys
import pickle
import zlib
import cv2
import numpy as np
import urllib2
import time
import random
from Queue import Queue
from threading import Thread
from sklearn.cluster import MiniBatchKMeans
from scipy.sparse import lil_matrix, csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier

FILE_LOAD_THREADS = 1
FILE_SEED = 24
CLUSTER_SEED = 24
CLUSTERS_NUMBER = 1000
BAYES_ALPHA = 0.1
ADA_BOOST_ESTIMATORS = 110
VERBOSE = True
USE_CACHE = True

_g_removed = False

class PCR:
  
    def __init__(self):
        self.__clustersNumber = CLUSTERS_NUMBER
        self.__queue = Queue()
        self.__verbose = VERBOSE
        self.__useCache = USE_CACHE

        for i in range(FILE_LOAD_THREADS):
            t = Thread(target = self.__worker)
            t.daemon = True
            t.start()
    
        self.__kmeans = MiniBatchKMeans(n_clusters = self.__clustersNumber, random_state = CLUSTER_SEED, verbose = self.__verbose)
        self.__tfidf = TfidfTransformer()
        self.__tfidf1 = TfidfTransformer()
    
        self.__clf = AdaBoostClassifier(MultinomialNB(alpha = BAYES_ALPHA), n_estimators = ADA_BOOST_ESTIMATORS)
        self.__clf1 = AdaBoostClassifier(MultinomialNB(alpha = BAYES_ALPHA), n_estimators = ADA_BOOST_ESTIMATORS)
    
    def __worker(self):
        while True:
            task = self.__queue.get()
            func, args = task
            try:
                func(args)
            except Exception as e:
                print 'EXCEPTION:', e
            self.__queue.task_done()
  
    def train(self, positiveFiles, negativeFiles):
        cachedData = self.__loadCache()
        if cachedData is None:
            self.__log('loading positives')
            positiveSamples = self.__loadSamples(positiveFiles)
            self.__log('loading negatives')
            negativeSamples = self.__loadSamples(negativeFiles)
        
            totalDescriptors = []
            self.__addDescriptors(totalDescriptors, positiveSamples)
            self.__addDescriptors(totalDescriptors, negativeSamples)
          
            self.__kmeans.fit(totalDescriptors)
            clusters = self.__kmeans.predict(totalDescriptors)
    
            self.__printDistribution(clusters)
            self.__saveCache((positiveSamples, negativeSamples, self.__kmeans, clusters))
        else:
            self.__log('using cache')
            positiveSamples, negativeSamples, self.__kmeans, clusters = cachedData
    
        totalSamplesNumber = len(negativeSamples) + len(positiveSamples)
        counts = lil_matrix((totalSamplesNumber, self.__clustersNumber))
        counts1 = lil_matrix((totalSamplesNumber, 256))
        self.__currentSample = 0
        self.__currentDescr = 0
        self.__calculteCounts(positiveSamples, counts, counts1, clusters)
        self.__calculteCounts(negativeSamples, counts, counts1, clusters)
        counts = csr_matrix(counts)
        counts1 = csr_matrix(counts1)
    
        self.__log('training bayes classifier')
        tfidf = self.__tfidf.fit_transform(counts)
        tfidf1 = self.__tfidf1.fit_transform(counts1)
        classes = [True] * len(positiveSamples) + [False] * len(negativeSamples)
        self.__clf.fit(tfidf, classes)
        self.__clf1.fit(tfidf1, classes)
    
        self.__log('training complete')
  
    def predict(self, files):
        self.__log('loading files')
        samples = self.__loadSamples(files)
        totalDescriptors = []
        self.__addDescriptors(totalDescriptors, samples)
        self.__log('predicting classes')
        clusters = self.__kmeans.predict(totalDescriptors)
        counts = lil_matrix((len(samples), self.__clustersNumber))
        counts1 = lil_matrix((len(samples), 256))
        self.__currentSample = 0
        self.__currentDescr = 0
        self.__calculteCounts(samples, counts, counts1, clusters)
        counts = csr_matrix(counts)
        counts1 = csr_matrix(counts1)
    
        tfidf = self.__tfidf.transform(counts)
        tfidf1 = self.__tfidf1.transform(counts1)
    
        self.__log('classifying')
    
        weights = self.__clf.predict_log_proba(tfidf.toarray())
        weights1 = self.__clf1.predict_log_proba(tfidf1.toarray())
        predictions = []
        for i in xrange(0, len(weights)):
            w = weights[i][0] - weights[i][1]
            w1 = weights1[i][0] - weights1[i][1]

            pred = w < 0
            pred1 = w1 < 0
      
            if pred != pred1:
                # the two classifiers disagree: decide by the combined log-probability margin
                pred = w + w1 < 0

            predictions.append(pred)
    
        self.__log('prediction complete')
        return predictions
  
    def saveModel(self, fileName):
        data = pickle.dumps((self.__clustersNumber, self.__kmeans, self.__tfidf, self.__tfidf1, self.__clf, self.__clf1), -1)
        data = zlib.compress(data)
        open(fileName, 'w').write(data)

    def loadModel(self, fileName):
        #print(fileName)
        data = open(fileName, 'r').read()
        #data = zlib.decompress(data)
        data = zlib.decompress(data)
        #data = zlib.decompressobj().decompress(data, zlib.MAX_WBITS)
        data = pickle.loads(data)
        self.__clustersNumber, self.__kmeans, self.__tfidf, self.__tfidf1, self.__clf, self.__clf1 = data
    
    def __log(self, message):
        if self.__verbose:
            print message
    
    def __saveCache(self, data):
        if not self.__useCache:
            return
        data = pickle.dumps(data, -1)
        data = zlib.compress(data)
        open('cache.bin', 'w').write(data)
  
    def __loadCache(self):
        if not self.__useCache:
            return None
        if not os.path.isfile('cache.bin'):
            return None
        data = open('cache.bin', 'r').read()
        data = zlib.decompress(data)
        data = pickle.loads(data)
        return data
  
    def __calculteCounts(self, samples, counts, counts1, clusters):
        # fill per-sample visual-word counts (counts) and hue-histogram counts (counts1)
        for s in samples:
            currentCounts = {}
            for d in s[0]:
                currentCounts[clusters[self.__currentDescr]] = currentCounts.get(clusters[self.__currentDescr], 0) + 1
                self.__currentDescr += 1
            for clu, cnt in currentCounts.iteritems():
                counts[self.__currentSample, clu] = cnt
            for i, histCnt in enumerate(s[1]):
                counts1[self.__currentSample, i] = histCnt[0]
            self.__currentSample += 1
  
    def __printDistribution(self, clusters):
        if not self.__verbose:
            return
        distr = {}
        for c in clusters:
            distr[c] = distr.get(c, 0) + 1
        v = sorted(distr.values(), reverse=True)
        print 'distribution:', v[0:15], '...', v[-15:]
  
    def __addDescriptors(self, totalDescriptors, samples):
        for sample in samples:
            for descriptor in sample[0]:
                totalDescriptors.append(descriptor)
  
    def __loadSamples(self, files):
        samples = [[]] * len(files)
        n = 0
        for f in files:
            self.__queue.put((self.__loadSingleSample, (f, samples, n)))
            n += 1
        self.__queue.join()
        if _g_removed:
            print ' === REMOVED = TERMINATE'
            sys.exit(44)
        return samples
  
    def __loadSingleSample(self, args):
        global _g_removed
        fileName, samples, sampleNum = args
        des, hist = self.__getFeatures(fileName)
        if des is None:
            print 'ERROR: failed to load', fileName
            os.remove(fileName)
            _g_removed = True
            #sys.exit(44)
            des = []
            hist = [[0]] * 256
        samples[sampleNum] = (des, hist)
  
    def __getFeatures(self, fileName):
        # SIFT descriptors + HSV hue histogram for one image (disk cache keyed by crc32; the cache write below is disabled)
        fid = 'cache/' + str(zlib.crc32(fileName))
        self.__log('loading %s' % fileName)
        if os.path.isfile(fid):
            des, hist = pickle.loads(open(fid, 'rb').read())
        else:
            img = cv2.imread(fileName)
      
            if img.shape[1] > 1000:
                cf = 1000.0 / img.shape[1]
                # downscale to at most 1000 px wide, preserving the aspect ratio
                img = cv2.resize(img, (int(cf * img.shape[1]), int(cf * img.shape[0])))
      
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            s = cv2.SIFT(nfeatures = 400)
      
            d = cv2.DescriptorExtractor_create("OpponentSIFT")
            kp = s.detect(gray, None)
            kp, des = d.compute(img, kp)
      
            hist = self.__getColorHist(img)
      
            #open(fid, 'wb').write(pickle.dumps((des, hist), -1))
    
        return des, hist

    def __getColorHist(self, img):
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        dist = cv2.calcHist([hsv],[0],None,[256],[0,256])
        return dist


def loadDir(dirName):
    files = os.listdir(dirName)
    fnames = []
    for f in files:
        if not f.endswith('.jpg'):
            continue
        fileName = dirName + '/' + f
        fnames.append(fileName)
    return fnames

def loadFileLists():
    random.seed(FILE_SEED)
  
    positiveFiles = sorted(loadDir('2'))
    negativeFiles = sorted(loadDir('1'))
  
    random.shuffle(positiveFiles)
    random.shuffle(negativeFiles)
  
    minLen = min(len(positiveFiles), len(negativeFiles))
  
    p20 = int(0.2 * minLen)
  
    testFiles = positiveFiles[:p20] + negativeFiles[:p20]
    positiveFiles = positiveFiles[p20:]
    negativeFiles = negativeFiles[p20:]
  
    print testFiles[0], negativeFiles[0], positiveFiles[0]
  
    testFiles = loadDir('1test')
  
    return positiveFiles, negativeFiles, testFiles
  

def train():
    positiveFiles, negativeFiles, testFiles = loadFileLists()

    pcr = PCR()
    pcr.train(positiveFiles, negativeFiles)
    pcr.saveModel('model.bin')
  
  
def predict():
  
    positiveFiles, negativeFiles, testFiles = loadFileLists()
    testFiles = testFiles
  
    pcr = PCR()
    pcr.loadModel('model.bin')
    pred = pcr.predict(testFiles)
    total = 0
    correct = 0

    for i in xrange(0, len(testFiles)):
        isCorrect = ((testFiles[i][0] == '1' and not pred[i]) or (testFiles[i][0] == '2' and pred[i]))

        print isCorrect, pred[i], testFiles[i]
        #if not isCorrect:
        #  print testFiles[i]
        correct += int(isCorrect)

    
        total += 1
    print 'sum: \t', float(correct) / total

def predictTest(file):
    files = [file]
    pcr = PCR()
    pcr.loadModel('model.bin')
    pred = pcr.predict(files)
    print '\n\n ===', pred[0], '===\n\n'

def predictUrl(url):
    filename = 'test.jpg'
    f = open(filename,'wb')
    f.write(urllib2.urlopen(url).read())
    f.close()
    time.sleep(0.5)
    predictTest(filename)

def printUsage():
    print 'Usage: '
    print '  %s train                              - train model' % sys.argv[0]
    print '  %s url http://sample.com/img.jpg      - check given url' % sys.argv[0]
    sys.exit(42)

predictTest('./img/')#0_18.jpg

'''
if __name__ == '__main__':
  if len(sys.argv) < 2:
    printUsage()
  mode = sys.argv[1]
  if mode == 'train':
    train()
    time.sleep(0.5)
    predict()
  elif mode == 'url':
    if len(sys.argv) < 3:
      printUsage()
    url = sys.argv[2]
    predictUrl(url)
  else:
    printUsage()
'''
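
The feature pipeline of PCR above is a bag-of-visual-words: SIFT descriptors from all training images are clustered with MiniBatchKMeans, each image is then represented by a histogram of its descriptors' cluster ids (plus an HSV hue histogram), and those counts go through tf-idf into the AdaBoost/MultinomialNB classifiers. Below is a minimal sketch of just the bag-of-visual-words step in Python 3 / OpenCV 4 terms (cv2.SIFT_create replaces the old cv2.SIFT, and plain SIFT stands in for OpponentSIFT, which is not in stock OpenCV builds); paths and parameter values are illustrative, not the author's.

import cv2
import numpy as np
from sklearn.cluster import MiniBatchKMeans

def sift_descriptors(path, nfeatures=400):
    # 128-dim SIFT descriptors of one image (empty array if none were found)
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    _, des = cv2.SIFT_create(nfeatures=nfeatures).detectAndCompute(gray, None)
    return des if des is not None else np.empty((0, 128), dtype=np.float32)

def bovw_histograms(paths, n_clusters=1000, seed=24):
    # cluster all descriptors, then count how often each visual word appears per image
    per_image = [sift_descriptors(p) for p in paths]
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=seed)
    kmeans.fit(np.vstack(per_image))
    hists = np.zeros((len(paths), n_clusters))
    for i, des in enumerate(per_image):
        if len(des):
            words, counts = np.unique(kmeans.predict(des), return_counts=True)
            hists[i, words] = counts
    return hists   # feed these counts into TfidfTransformer + a classifier, as in PCR.train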