In TensorFlow, a variable is created by calling the Variable method and passing in its initial value; computations, too, must go through TensorFlow…
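A minimal sketch of that idea, assuming the TensorFlow 1.x session API this post appears to describe:

import tensorflow as tf

a = tf.Variable(3.0)   # create a variable by passing in its initial value
b = tf.constant(2.0)
c = tf.add(a, b)       # operations go through TensorFlow ops, not plain Python

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # variables must be initialized first
    print(sess.run(c))                            # 5.0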
Bayes' Theorem with a Worked Example (a Spell Checker)
http://blog.csdn.net/kong1287988804/article/details/779…
Association Analysis: the Apriori and FP-Growth Algorithms
In nature, when one kind of event occurs, other events often occur along with it; this kind of link is called an association. Knowledge that captures dependence or association between events is called associative knowledge (…
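To make the idea concrete, here is a minimal, illustrative sketch of Apriori's frequent-itemset search (the baskets data below is made up purely for the example):

def apriori_frequent_itemsets(transactions, min_support):
    # transactions: a list of frozensets; min_support: a fraction of all transactions
    n = len(transactions)
    def support(itemset):
        return sum(itemset <= t for t in transactions) / n
    # start from the frequent 1-itemsets
    current = {frozenset([i]) for t in transactions for i in t}
    current = {s for s in current if support(s) >= min_support}
    frequent = {}
    k = 1
    while current:
        for s in current:
            frequent[s] = support(s)
        # join step: merge frequent k-itemsets into candidate (k+1)-itemsets
        candidates = {a | b for a in current for b in current if len(a | b) == k + 1}
        current = {c for c in candidates if support(c) >= min_support}
        k += 1
    return frequent

baskets = [frozenset(t) for t in ({"milk", "bread"}, {"milk", "diaper", "beer"},
                                  {"milk", "bread", "diaper"}, {"bread", "diaper"})]
print(apriori_frequent_itemsets(baskets, 0.5))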
Iris Classification in Practice (KNN and Naive Bayes)
import pandas as pda
import numpy as npy

# Load the data, option 1: from a local CSV file
filename = "D:/我的教学/Python/CSDN数据挖掘&机器学习升级版/第3次课代码/数据/iris.csv"
dataf = pda.read_csv(filename)
x = dataf.iloc[:, 0:4].values
y = dataf.iloc[:, 4:5].values

# Load the data, option 2: from scikit-learn's bundled datasets
from sklearn import datasets
irisdata = datasets.load_iris()
x = irisdata.data
y = irisdata.target

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=20)

# KNN classification (1): hand-written implementation
from numpy import *
import operator

def knn(k, testdata, traindata, labels):
    # testdata: a 1-D array, e.g. [0, 0, 1, ...]
    # traindata: a 2-D array, e.g. [[0, 0, 1, ...], [...], ...]
    # labels: a 1-D list aligned row by row with traindata
    # shape[0] is the first dimension, i.e. the number of training samples
    traindatasize = traindata.shape[0]
    # tile() replicates the 1-D test sample into the same shape as the training data
    dif = tile(testdata, (traindatasize, 1)) - traindata
    sqdif = dif ** 2
    # axis=1 sums across each row, so sumsqdif is 1-D from here on
    sumsqdif = sqdif.sum(axis=1)
    distance = sumsqdif ** 0.5
    # indices of the training samples ordered from nearest to farthest
    sortdistance = distance.argsort()
    count = {}
    for i in range(0, k):
        # sortdistance[i] is the index of the i-th nearest training sample;
        # vote is that sample's class
        vote = labels[sortdistance[i]]
        count[vote] = count.get(vote, 0) + 1
    sortcount = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
    return sortcount[0][0]

y2 = []
for i in range(0, len(x_test.tolist())):
    predict = knn(3, x_test.tolist()[i], x_train, y_train.tolist())
    y2.append(predict)

# KNN classification (2): scikit-learn
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(x_train, y_train)
y2 = model.predict(x_test)

# Naive Bayes classification (1): hand-written implementation
import numpy
import operator

class Bayes:
    def __init__(self):
        self.length = -1
        # prior probability of each class: {"class1": p1, "class2": p2, ...}
        self.labelcount = dict()
        # feature vectors grouped by class:
        # {"class1": [vector1, vector2, ...], ..., "classn": [vector1, vector2, ...]}
        self.vectorcount = dict()

    def fit(self, dataSet: list, labels: list):
        if len(dataSet) != len(labels):
            raise ValueError("The training set and the label list differ in length")
        self.length = len(dataSet[0])  # number of features per sample
        labelsnum = len(labels)        # number of labels, duplicates included
        norlabels = set(labels)        # the distinct classes
        for item in norlabels:
            # share of this class among all samples, i.e. the prior p(C)
            self.labelcount[item] = labels.count(item) / labelsnum
        # zip pairs the two lists element-wise, e.g.
        # x1 = [a, b, c], x2 = [e, f, g]  ->  zip(x1, x2): [(a, e), (b, f), (c, g)]
        for vector, label in zip(dataSet, labels):
            if label not in self.vectorcount:
                self.vectorcount[label] = []
            self.vectorcount[label].append(vector)
        print("Training finished")
        return self

    def btest(self, TestData, labelsSet):
        if self.length == -1:
            raise ValueError("The model has not been trained yet; call fit() first")
        # probability of TestData under each class: {"class1": p1, "class2": p2, ...}
        lbDict = dict()
        for thislb in labelsSet:
            p = 1
            alllabel = self.labelcount[thislb]    # the prior p(C)
            allvector = self.vectorcount[thislb]  # all feature vectors of this class
            vnum = len(allvector)                 # number of vectors in this class
            # transpose so each row holds one feature across all samples, e.g.
            # [[t1, t2, t3, t4], [t1, t2, t3, t4]] -> [t1, t1], [t2, t2], ..., [t4, t4]
            allvector = numpy.array(allvector).T
            for index in range(0, len(TestData)):
                # per-feature likelihood p(feature | C); e.g. if TestData[0] is 0 and
                # the feature column is [1, 0, 1, 0, 0, 1, 1], then p is 3/7
                # (this count-based estimate only suits discrete feature values)
                vector = list(allvector[index])
                p *= vector.count(TestData[index]) / vnum
            lbDict[thislb] = p * alllabel  # posterior up to a constant: p(x|C) * p(C)
        thislabel = sorted(lbDict, key=lambda x: lbDict[x], reverse=True)[0]
        return thislabel

bys = Bayes()
bys.fit(x_train.tolist(), y_train.tolist())
labels = [0, 1, 2]
y2 = []
for i in range(0, len(x_test.tolist())):
    predict = bys.btest(x_test.tolist()[i], labels)
    y2.append(predict)

# Naive Bayes classification (2): scikit-learn
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_train, y_train)
expected = y_test
predicted = model.predict(x_test)
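A quick way to check any of the four classifiers above, assuming y2 holds the predictions for x_test:

from sklearn.metrics import accuracy_score, classification_report

print("accuracy:", accuracy_score(y_test, y2))
print(classification_report(y_test, y2))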
The Naive Bayes Algorithm
# Caveat: not applicable when features are correlated with each other. Naive Bayes: P(B|A) = P(A|B)P(B)/P(A). Price (A), course hours…
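A tiny worked example of the formula, with made-up numbers purely for illustration:

# hypothetical values, only to show the arithmetic of P(B|A) = P(A|B) * P(B) / P(A)
p_B = 0.3          # P(B): prior probability of class B
p_A_given_B = 0.5  # P(A|B): probability of feature A within class B
p_A = 0.25         # P(A): overall probability of feature A

p_B_given_A = p_A_given_B * p_B / p_A
print(p_B_given_A)  # 0.6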
Data Preprocessing Code
# Import the data
import pymysql
import numpy as npy
import pandas as pda
import matplotlib.pylab as pyl

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="csdn")
sql = "select * from taob"
data = pda.read_sql(sql, conn)

# Data cleaning
# Detect missing values: a price of 0 counts as missing, then fill every
# missing cell with 64
x = 0
data.loc[data["price"] == 0, "price"] = None
for i in data.columns:
    for j in range(len(data)):
        if data[i].isnull()[j]:
            data.loc[j, i] = 64
            x += 1
# print(x)

# Outlier handling
# Locate outliers with a scatter plot (x axis: price, y axis: comment count)
data2 = data.T
price = data2.values[2]
comt = data2.values[3]
pyl.plot(price, comt, "o")
pyl.show()

# Treat rows with an abnormally large comment count or price as outliers
# (thresholds 65 and 100 in the code below)
line = len(data.values)
col = len(data.values[0])
da = data.values

# Value-replacement approach
'''
for i in range(0, line):
    for j in range(0, col):
        if j == 3 and da[i][3] > 65:      # comment count: replace the value
            da[i][j] = 35
        elif j == 2 and da[i][2] > 100:   # price: replace the value
            da[i][j] = 64
'''

# Row-deletion approach
x = 0
for i in range(0, line):
    # skip (i.e. delete) rows whose comment count or price is out of range;
    # the decision is per row, so no inner column loop is needed
    if da[i][3] > 65 or da[i][2] > 100:
        continue
    if x == 0:
        newda = da[i]
    else:
        newda = npy.row_stack((newda, da[i]))
    x += 1

da2 = newda.T
price = da2[2]
comt = da2[3]
pyl.plot(price, comt, "o")
pyl.show()

# Distribution analysis
'''
Find the minimum and maximum
Compute the range (max - min)
Bin width: range / number of bins
Draw the histogram
'''
pricemax = da2[2].max()
pricemin = da2[2].min()
commentmax = da2[3].max()
commentmin = da2[3].min()
# Range
pricerg = pricemax - pricemin
commentrg = commentmax - commentmin
# Bin width
pricedst = pricerg / 13
commentdst = commentrg / 13
# Price histogram
# npy.arange(min, max, step)
pricesty = npy.arange(pricemin, pricemax, pricedst)
pyl.hist(da2[2], pricesty)
pyl.show()
# Comment-count histogram (note: da2[3], not da2[2])
commentsty = npy.arange(commentmin, commentmax, commentdst)
pyl.hist(da2[3], commentsty)
pyl.show()
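As a side note, the missing-value loop above can be collapsed into vectorized pandas calls; a sketch with the same effect, assuming the columns are numeric:

data["price"] = data["price"].replace(0, npy.nan)  # treat a zero price as missing
data = data.fillna(64)                             # fill every missing cell with 64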
Chinese Handwritten Character Recognition
from skimage import transform
import pandas as pda
import os
import skimage.io as io
import skimage.color as cl
import skimage.exposure as expos

def readimg(path, xsize, ysize):
    # read the image
    img = io.imread(path)
    # convert to grayscale
    img_gray = cl.rgb2gray(img)
    # resize to 44 x 32
    img_resize = transform.resize(img_gray, (xsize, ysize))
    # histogram equalization
    img_hist = expos.equalize_hist(img_resize)
    # flatten the 2-D array into a 1-D vector
    img_vector = img_hist.flatten()
    return img_vector.tolist()

# img1 = readimg("D:/我的教学/Python/CSDN数据挖掘&机器学习升级版/第7次课代码/中文识别数据集/训练数据/中/0.jpg", 44, 32)

# Load the training data in bulk: label 0 for "中", label 1 for "国"
train_x = []
train_y = []
zhong_all = os.listdir("C:/Users/Administrator/Desktop/python技巧/图像识别/训练数据/中/")
for i in zhong_all:
    path = "C:/Users/Administrator/Desktop/python技巧/图像识别/训练数据/中/" + str(i)
    thisdata = readimg(path, 44, 32)
    train_x.append(thisdata)
    train_y.append(0)
guo_all = os.listdir("C:/Users/Administrator/Desktop/python技巧/图像识别/训练数据/国/")
for i in guo_all:
    path = "C:/Users/Administrator/Desktop/python技巧/图像识别/训练数据/国/" + str(i)
    thisdata = readimg(path, 44, 32)
    train_x.append(thisdata)
    train_y.append(1)
xf = pda.DataFrame(train_x)
yf = pda.DataFrame(train_y)
train_x = xf.values.astype(float)
train_y = yf.values.astype(float)

# Build the neural network model
from keras.models import Sequential
from keras.layers import Dense, Activation
model = Sequential()
# input layer
model.add(Dense(10, input_dim=len(train_x[0])))
model.add(Activation("relu"))
# output layer (input_dim is only needed on the first layer)
model.add(Dense(1))
# compile the model
model.compile(loss="mean_squared_error", optimizer="adam")
# train
model.fit(train_x, train_y, epochs=100, batch_size=100)

# ----------
# Prediction
# Check whether "中" can be recognized
# test_x = []
# zhong_test_all = os.listdir("D:/我的教学/Python/CSDN数据挖掘&机器学习升级版/第7次课代码/中文识别数据集/测试数据/中/")
# for i in zhong_test_all:
#     path = "D:/我的教学/Python/CSDN数据挖掘&机器学习升级版/第7次课代码/中文识别数据集/测试数据/中/" + str(i)
#     thisdata = readimg(path, 44, 32)
#     test_x.append(thisdata)
# xf = pda.DataFrame(test_x)
# test_x = xf.values.astype(float)
# rst = model.predict_classes(test_x)
# for i in rst:
#     if str(i[0]) == "0":
#         print("Result: 中")
#     else:
#         print("Result: 国")

# Check whether "国" can be recognized
# (the analogous commented-out block, reading from 测试数据/国/ instead, is omitted here)

# Check that the images in test set 2 are classified correctly
test_x = []
test2_all = os.listdir("C:/Users/Administrator/Desktop/python技巧/图像识别/测试数据2/")
for i in range(0, len(test2_all)):
    path = "C:/Users/Administrator/Desktop/python技巧/图像识别/测试数据2/" + str(i) + ".jpg"
    thisdata = readimg(path, 44, 32)
    test_x.append(thisdata)
xf = pda.DataFrame(test_x)
test_x = xf.values.astype(float)
rst = model.predict_classes(test_x)
print("Result: ", end="")
for i in rst:
    if str(i[0]) == "0":
        print("中", end="")
    else:
        print("国", end="")
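A design note: the network above regresses a 0/1 label with mean squared error. For a two-class problem, a sigmoid output trained with binary cross-entropy is the more conventional setup; a sketch of that alternative (not the author's original configuration):

from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(10, input_dim=len(train_x[0]), activation="relu"))
model.add(Dense(1, activation="sigmoid"))   # sigmoid keeps the output in [0, 1]
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_x, train_y, epochs=100, batch_size=100)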
Some Tips on Image Processing in Python
The phone-comment scraping tool I wrote earlier in Python calls adb to screenshot the phone and then analyzes the image, simulating swipes and using Baidu's OCR image-to-text…
(Original) Scraping Stock Comments from the Tonghuashun (同花顺) Mobile App
import os, time
from PIL import Image
from aip import AipOcr

def cutImage(x, y, dx, dy, f1, f2):
    """Crop a (dx, dy)-sized region with top-left corner (x, y) from f1 and save it to f2."""
    im = Image.open(f1)
    img_size = im.size
    print("xx:{}".format(img_size))
    # PIL's crop takes (left, upper, right, lower)
    region = im.crop((x, y, x + dx, y + dy))
    region.save(f2)

def jisuan(f1):
    """Locate avatar and comment boundaries in a screenshot.
    dingbu = top of an avatar, dibu = bottom-right of the first avatar,
    pljieshu = end of the comment, yidongjuli = distance to scroll."""
    dingbu1_x = []
    dingbu1_y = []
    dingbu2_x = []
    dingbu2_y = []
    # end row of the comment (excluding replies to the comment)
    pljieshu_x = []
    pljieshu_y = []
    im2 = Image.open(f1)
    w2, h2 = im2.size
    im_pixel = im2.load()
    scan_start_y = 0  # starting y coordinate of the scan
    # scan for the round avatar in 10 px steps
    for i in range(1, int(h2), 10):
        last_pixel = im_pixel[0, i]
        for j in range(1, int(w2)):
            pixel = im_pixel[j, i]
            # a row that is not a solid-color line: record scan_start_y and break out
            # (indices 0, 1, 2 are the R, G, B channel values)
            if pixel[0] != last_pixel[0] or pixel[1] != last_pixel[1] or pixel[2] != last_pixel[2]:
                scan_start_y = i - 10
                break
        if scan_start_y:
            break
    # assume the first round avatar lies within y < 300
    for i in range(scan_start_y, 300):
        for j in range(50, 180):  # restricting x also cuts the scanning cost
            pixel = im_pixel[j, i]
            # record the avatar's top coordinate
            if pixel[0] != 23 and pixel[1] != 22 and pixel[2] != 22:
                dingbu1_x.append(j)
                dingbu1_y.append(i)
                break
    dingbu1_x = dingbu1_x[0]
    dingbu1_y = dingbu1_y[0]
    dibu_x = 180               # x of the avatar's bottom-right corner
    dibu_y = dingbu1_y + 140   # y of the avatar's bottom-right corner
    detaX = 1200
    # locate the second avatar
    for i in range(dingbu1_y + 100, 1000):
        pixel = im_pixel[100, i]
        if pixel[0] != 23 and pixel[1] != 22 and pixel[2] != 22:
            dingbu2_x.append(100)
            dingbu2_y.append(i)
            break
    dingbu2_x = dingbu2_x[0]
    dingbu2_y = dingbu2_y[0]
    # find where the comment text ends, between the two avatars
    for i in range(dibu_y, dingbu2_y):
        for j in range(180, 1200):
            pixel = im_pixel[j, i]
            if pixel[0] == 32 and pixel[1] == 32 and pixel[2] == 34:
                pljieshu_x.append(j)
                pljieshu_y.append(i)
                break
    yidongjuli = dingbu2_y - dingbu1_y  # vertical distance between the two avatars
    if len(pljieshu_x) == 0:
        if yidongjuli > 530:
            detaY = 323
        else:
            detaY = yidongjuli * 0.6985
    else:
        pljieshu_x = pljieshu_x[0]
        pljieshu_y = pljieshu_y[0]
        detaY = pljieshu_y - dibu_y
    return dibu_x, dibu_y, detaX, detaY, yidongjuli

def get_file_content(filePath):
    """Read an image file as bytes."""
    with open(filePath, 'rb') as fp:
        return fp.read()

def shibiebingbaocun(image):
    """Run OCR on the image and append the recognized comment text to a file."""
    respon = client.basicGeneral(image)  # after the 500 free calls run out, switch to client.basicAccurate(image)
    comments = respon['words_result']    # the recognized comment text
    ans = " "
    for comment in comments:
        ans = ans + comment['words']
    with open('./comments.txt', 'a') as f:
        f.write(ans)
    print(ans)

def huadong(swipe_x1, swipe_y1, swipe_x2, swipe_y2):
    """Simulate a swipe on the phone via adb."""
    cmd = 'adb shell input swipe {x1} {y1} {x2} {y2} '.format(
        x1=swipe_x1,
        y1=swipe_y1,
        x2=swipe_x2,
        y2=swipe_y2,
    )
    print(cmd)
    os.system(cmd)

if __name__ == '__main__':
    start = time.time()
    """ Your Baidu OCR APP_ID / API_KEY / SECRET_KEY """
    APP_ID = '10686092'
    API_KEY = '72HAGKk7aE3aVooGHvoqEFnx'
    SECRET_KEY = 'hfhdxHhhOr7lIreYrXcS31a0PWRyKEbl'
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    # number of comment screens to collect
    for i in range(0, 300):
        os.system("adb shell /system/bin/screencap -p /sdcard/screenshot.png")
        os.system("adb pull /sdcard/screenshot.png ./screenshot.png")
        # first crop
        cutImage(0, 600, 1440, 1400, r"./screenshot.png", r"./crop_test1.png")
        dibu_x, dibu_y, detaX, detaY, yidongjuli = jisuan(r"./crop_test1.png")
        # second crop
        print(dibu_x, dibu_y, detaX, detaY, yidongjuli)
        cutImage(dibu_x, dibu_y, detaX, detaY, r"./crop_test1.png", r"./crop_test1.png")
        image = get_file_content(r"./crop_test1.png")
        shibiebingbaocun(image)
        huadong(700, 1700, 700, int(1700 - yidongjuli * 0.88))
        time.sleep(1)
    end = time.time()
    print('Elapsed time: ' + str(end - start) + ' s')
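To run this, adb must be on the PATH with USB debugging enabled on the phone, and the Baidu OCR SDK installed (the AipOcr client comes from the baidu-aip package: pip install baidu-aip). The hard-coded pixel coordinates assume a 1440 px-wide screen, so other resolutions will need the constants retuned.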
Extracting Features from the UCI Breast Cancer Wisconsin Dataset with the ReliefF Algorithm
The experimental data in this example comes from the well-known UCI Machine Learning Repository, which hosts a large amount of data for AI and data mining; its URL is http://arch…
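The same Breast Cancer Wisconsin data also ships with scikit-learn, so the core ReliefF idea can be tried directly. Below is a minimal, simplified sketch (near-misses are pooled over all other classes rather than weighted by class priors, and each class is assumed to have more than n_neighbors samples):

import numpy as npy
from sklearn.datasets import load_breast_cancer

def relieff(X, y, n_neighbors=5, n_iters=100, seed=0):
    # Weight each feature by how well it separates near-hits (same class)
    # from near-misses (other classes).
    rng = npy.random.default_rng(seed)
    n, d = X.shape
    w = npy.zeros(d)
    span = X.max(axis=0) - X.min(axis=0)  # normalize per-feature differences
    span[span == 0] = 1
    for _ in range(n_iters):
        i = rng.integers(n)
        diff = npy.abs(X - X[i]) / span   # normalized difference per feature
        dist = diff.sum(axis=1)
        same = (y == y[i])
        other = ~same                     # samples of a different class
        same[i] = False                   # exclude the sampled point from its own hits
        hits = npy.argsort(npy.where(same, dist, npy.inf))[:n_neighbors]
        misses = npy.argsort(npy.where(other, dist, npy.inf))[:n_neighbors]
        # good features differ little from hits and a lot from misses
        w -= diff[hits].mean(axis=0) / n_iters
        w += diff[misses].mean(axis=0) / n_iters
    return w

data = load_breast_cancer()
weights = relieff(data.data, data.target)
top5 = npy.argsort(weights)[::-1][:5]
print([data.feature_names[i] for i in top5])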