共计 2454 个字符,预计需要花费 7 分钟才能阅读完成。
最近想爬一下豆瓣网,又不想通过登录后保存cookies的方式来保持登录状态,所以想通过识别网页上的验证码,直接用自己的账号密码登录。
在网上找了一些资料,通过观察豆瓣验证码图片后得到以下思路:
1、豆瓣网验证码中字母的颜色比较单一,且与背景颜色差异较大,所以可以通过设定一个RGB阈值对图片进行二值化;
2、对二值化后的图片进行加窗滤波消除噪点;
3、对消除噪点后的图片进行分割;
分割思路:由于豆瓣验证码中的字母互不粘连,所以可以逐列(沿纵轴方向)统计非白色像素点的个数,根据没有非白色点的空白列来划分字符。
4、对分割后的图片进行识别,可以是自建模板匹配,也可以利用有关的库进行匹配。
注:
1、在本文中由于库安装不全,就没有对分割后的图片做识别了。
2、关于识别率:对于笔画粗细均匀的字母,去噪效果明显好于笔画粗细不均的字母,识别率自然也高一些。
3、基本效果如下:
不多说了,直接上代码。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from PIL import Image
#import pytesseract
# RGB value painted onto background pixels when the image is binarised.
WHITE = (255,255,255)
# RGB value painted onto dark (foreground) pixels when the image is binarised.
BLACK = (0,0,0)
def pre_concert(img, threshold=30):
    """Binarise an image in place to strip the captcha background.

    A pixel is painted ``WHITE`` when any of its red, green or blue values
    exceeds ``threshold``; otherwise (all three channels are dark) it is
    painted ``BLACK``. Afterwards the image is displayed with ``img.show()``
    and written to ``pre_fig.jpg``.

    Args:
        img: Image whose ``getpixel`` yields exactly three channel values
            per pixel (the unpacking below fails for any other layout).
            It is modified in place.
        threshold: Highest channel value still treated as "dark".
            Defaults to 30, the value that used to be hard-coded.

    Returns:
        None. The result is the mutated ``img`` plus the saved file.
    """
    width, height = img.size
    for x in range(width):
        for y in range(height):
            r, g, b = img.getpixel((x, y))
            # Foreground only when every channel is at or below the threshold.
            is_dark = max(r, g, b) <= threshold
            img.putpixel((x, y), BLACK if is_dark else WHITE)
    img.show()
    img.save("pre_fig.jpg")
def remove_noise(self, window=1):
    """Remove isolated noise pixels from a binarised image in place.

    For every pixel the neighbourhood selected by ``window`` is inspected
    and the pixel becomes ``BLACK`` when more than half of the sampled
    pixels equal ``BLACK``, otherwise ``WHITE`` (a median over 0/1 values).
    Pixels whose neighbourhood reaches outside the image are set to
    ``WHITE``. The image is then shown and saved as ``mov_noise_fig.jpg``.

    The filter reads from and writes to the same image, column by column,
    so neighbours that were already visited contribute their filtered value.

    Args:
        self: Image to filter; modified in place. Pixels are compared
            against the 3-tuple ``BLACK``.
        window: ``1`` for the 5-point cross (pixel plus its 4 direct
            neighbours), ``2`` for the full 3x3 square.

    Returns:
        None.

    Raises:
        ValueError: If ``window`` is neither 1 nor 2. Previously this
            surfaced as an ``UnboundLocalError`` further down.
    """
    if window == 1:
        offsets = ((1, 0), (0, 1), (0, 0), (-1, 0), (0, -1))
    elif window == 2:
        offsets = (
            (-1, -1), (0, -1), (1, -1),
            (-1, 1), (0, 1), (1, 1),
            (1, 0), (-1, 0), (0, 0),
        )
    else:
        raise ValueError("window must be 1 or 2, got %r" % (window,))

    width, height = self.size
    # With an odd sample count the median is 1 exactly when the black
    # samples outnumber half of the window.
    half = len(offsets) // 2

    for x in range(width):
        for y in range(height):
            black_count = 0
            touches_border = False
            for dx, dy in offsets:
                nx, ny = x + dx, y + dy
                # Explicit bounds check instead of waiting for IndexError:
                # image libraries that accept negative coordinates would
                # read from the opposite edge rather than raise, making the
                # left/top border behave differently from right/bottom.
                if not (0 <= nx < width and 0 <= ny < height):
                    touches_border = True
                    break
                if self.getpixel((nx, ny)) == BLACK:
                    black_count += 1
            if touches_border or black_count <= half:
                self.putpixel((x, y), WHITE)
            else:
                self.putpixel((x, y), BLACK)

    self.show()
    self.save("mov_noise_fig.jpg")
def split_fig(self, min_char_width=10, min_slice_width=5, max_noise_pixels=5):
    """Cut a binarised captcha image into per-character slices.

    Columns are scanned left to right. A column counts as empty when all of
    its pixels share a single colour; a run of non-empty columns is treated
    as one glyph. Each glyph span starts one column before its first
    non-empty column (clamped to 0) and ends at the first empty column after
    it (or at the image width).

    A copy of the image with a black vertical guide line at every detected
    boundary is shown and saved as ``split_fig.jpg``; the source image is
    not modified.

    Args:
        self: Image to split; pixels are compared against ``BLACK``.
        min_char_width: Spans narrower than this are merged with the span
            that follows them. A narrow span with no successor is dropped.
        min_slice_width: Cropped slices narrower than this are discarded.
        max_noise_pixels: Slices containing at most this many ``BLACK``
            pixels are discarded as non-characters.

    Returns:
        List of cropped images, one per accepted slice, in left-to-right
        order.
    """
    pixels = self.load()
    preview = self.copy()
    preview_pixels = preview.load()
    width, height = self.size

    # Pass 1: collect alternating start/end x positions of the glyph runs.
    # The scan starts in the "empty" state so that a blank left margin does
    # not register a boundary at column 0. The old code always recorded one
    # there, which shifted the start/end pairing onto the gaps between
    # glyphs instead of the glyphs themselves.
    boundaries = []
    inside_glyph = False
    for x in range(width):
        column_colours = {pixels[x, y] for y in range(height)}
        has_ink = len(column_colours) != 1
        if has_ink == inside_glyph:
            continue
        # Entering a glyph: keep one column of margin, never below 0.
        boundaries.append(max(x - 1, 0) if has_ink else x)
        # Guide line on the preview copy only.
        for y in range(height):
            preview_pixels[x, y] = BLACK
        inside_glyph = has_ink
    if inside_glyph:
        # Glyph touches the right edge: close its span at the image width.
        boundaries.append(width)

    preview.show()
    preview.save("split_fig.jpg")

    spans = list(zip(boundaries[0::2], boundaries[1::2]))

    # Pass 2: merge narrow spans with their right-hand neighbour.
    merged = []
    idx = 0
    while idx < len(spans):
        left, right = spans[idx]
        if right - left >= min_char_width:
            merged.append((left, right))
            idx += 1
            continue
        if idx + 1 == len(spans):
            # Nothing to merge with; the narrow trailing span is dropped.
            break
        merged.append((left, spans[idx + 1][1]))
        idx += 2

    # Pass 3: crop and filter out slices that cannot be characters.
    slices = []
    for left, right in merged:
        piece = self.crop((left, 0, right, height))
        piece_width, piece_height = piece.size
        if piece_width < min_slice_width:
            continue
        piece_pixels = piece.load()
        black_count = sum(
            1
            for i in range(piece_width)
            for j in range(piece_height)
            if piece_pixels[i, j] == BLACK
        )
        if black_count <= max_noise_pixels:
            continue
        slices.append(piece)
    return slices
def image_to_string(img, config='-psm 8'):
    """Run Tesseract OCR on ``img`` and return the lower-cased text.

    ``pytesseract`` is imported here because the module-level import is
    commented out in this file; without it every call hit a ``NameError``
    that the old bare ``except`` silently turned into ``None``.

    Args:
        img: Image handed unchanged to ``pytesseract.image_to_string``.
        config: Extra Tesseract options, passed through as-is.
            NOTE(review): newer Tesseract releases appear to expect
            ``--psm 8`` rather than ``-psm 8`` - confirm against the
            installed version. The default is kept for compatibility.

    Returns:
        The recognised text, stripped and lower-cased, or ``None`` when
        ``pytesseract`` is not installed or recognition fails.
    """
    try:
        import pytesseract
    except ImportError:
        return None
    try:
        result = pytesseract.image_to_string(img, lang='eng', config=config)
    except Exception:
        # Best-effort helper: any OCR failure is reported as "no result".
        return None
    return result.strip().lower()
def main():
    """Demo run: binarise, denoise and split the captcha in ``fig2.jpg``."""
    # Force three-channel RGB. pre_concert unpacks each pixel into (r, g, b)
    # and the later steps compare pixels against 3-tuples, so a grayscale,
    # palette or RGBA file would otherwise fail.
    img = Image.open("fig2.jpg").convert("RGB")
    pre_concert(img)
    remove_noise(img, 2)
    # The slices are not passed on to OCR here; recognition is not wired up.
    split_fig(img)


if __name__ == '__main__':
    main()
正文完
请博主喝杯咖啡吧!
自我点赞一波!