本文共 1879 字,大约阅读时间需要 6 分钟。
import os
import re
import urllib.error
import urllib.parse
import urllib.request

import numpy as np

# Captures the protocol-relative thumbnail URL ("//host/path.jpg") stored in
# the lazy-load attribute of each product image on the result page.
_IMAGE_URL_PATTERN = re.compile('source-data-lazy-img="(//.*?jpg)')

# Browser-like headers sent with every result-page request.
# "Accept-Encoding" is deliberately not sent: urllib does not decompress
# gzip/deflate/br bodies, so asking for them would make the page unreadable.
_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}


def find_image_urls(html):
    """Return the absolute https URLs of all lazy-loaded images in *html*.

    Args:
        html: Decoded page markup (``str``).

    Returns:
        A list of ``https:``-prefixed image URLs in document order; empty when
        the markup contains no ``source-data-lazy-img`` attribute.
    """
    # NOTE(review): the original script first narrowed the page with a section
    # regex that was an empty string (presumably lost when the code was
    # published), which made every search run on '' and find nothing.
    # The whole page is scanned instead.
    return ['https:' + relative for relative in _IMAGE_URL_PATTERN.findall(html)]


def craw(url, page, savedir):
    """Download every product thumbnail referenced by one result page.

    Args:
        url: URL string or ``urllib.request.Request`` of the page to fetch.
        page: Page label embedded in the saved file names.
        savedir: Path prefix for the saved files; it is concatenated as-is,
            so it must end with a path separator.

    Raises:
        urllib.error.URLError: If the result page itself cannot be fetched.
            Failures of individual image downloads are reported and skipped.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        # Decode the bytes properly; str(bytes) would yield the "b'...'" repr.
        html = response.read().decode(charset, errors='replace')

    # enumerate keeps the image numbering stable even when a download fails.
    for x, imagurl in enumerate(find_image_urls(html), start=1):
        imagname = savedir + '第' + str(page) + '页' + '第' + str(x) + '个' + '.jpg'
        try:
            urllib.request.urlretrieve(imagurl, filename=imagname)
        except urllib.error.URLError as e:
            # Best effort: report the failure and carry on with the next image.
            print('Failed to download', imagurl, ':', e)
        else:
            print('已输出第', page, '页,第', x, '个')


def main():
    """Crawl the JD search results for each keyword into ./img/<keyword>/."""
    page_ = 45
    key = ['衬衫', '马甲衬衫', '马甲', '女生职业装', '女士西服']
    for name in key:
        savedir = './img/' + name + '/'
        os.makedirs(savedir, exist_ok=True)
        key_temp = urllib.parse.quote(name)
        # Twice as many requests as page_: labels advance by 0.5 per request.
        for i in range(1, 2 * page_ + 1):
            # Request i gets the label 1.0, 1.5, 2.0, ... (both branches of
            # the original if/else reduced to this same expression).
            key2 = (i + 1) / 2
            url2 = 'https://search.jd.com/Search?keyword=' + key_temp + '&enc=utf-8&page=' + str(i)
            # Imitate a browser; the request carrying the headers is the one
            # actually sent (the original built it and then discarded it).
            req = urllib.request.Request(url2, headers=_REQUEST_HEADERS)
            try:
                craw(req, key2, savedir)
            except urllib.error.URLError as e:
                # One unreachable page should not abort the whole crawl.
                print('Failed to fetch', url2, ':', e)


if __name__ == "__main__":
    main()
本代码来自网络,原始网址已不记得了。原作者如看到,请告知,以便注明出处。
转载地址:http://tuhws.baihongyu.com/