To collect src material I wrote a few scrapers that pull photo-set images off the web. They are written in Python; save each one as a .py file to use it. No idea why, but the '+' signs in the code went missing, so please remember to add them back when you use it.
Duitang scraper (for Liu Yan):
from urllib import request, parse
import json
import jsonpath
import requests
import os


def mkdir(path):
    # Create the folder if it does not exist yet; makedirs also
    # creates any missing parent directories along the way.
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)


def DuiTang(word, num, local_path):
    url_head = 'https://www.duitang.com/napi/blog/list/by_search/?kw='
    form = parse.quote(word)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    for page in range(1, int(num)):
        # The search API returns 24 items per page, paged via "start".
        req = request.Request(
            url=url_head + form
                + '&type=feed&include_fields=top_comments,is_root,source_link,item,buyable,root_id,status,like_count,like_id,sender,album,reply_count,favorite_blog_id&_type=&start='
                + str(page * 24) + '&_=' + str(1540109911555 + page),
            headers=headers)
        response = request.urlopen(req)
        html = response.read()
        unicodestr = json.loads(html)
        # Every "path" field in the JSON is an image URL.
        url_list = jsonpath.jsonpath(unicodestr, "$..path")
        for li in url_list:
            print('\n')
            print(li)
            name = li.split('/')[-1]
            try:
                filename = local_path + '/' + name
                if not os.path.isfile(filename):
                    document = requests.get(url=li, headers=headers)
                    with open(filename, 'wb') as f:
                        f.write(document.content)
                    print('downloaded')
                else:
                    print('already exists')
            except Exception:
                print('download failed, skipping')


if __name__ == '__main__':
    word = '柳岩'  # search keyword (Liu Yan)
    num = 200
    local_path = "D:\\Another\\temp\\ANOTHER_RUBBISH\\VIDEOS\\DFL_Src\\duitang\\" + word
    mkdir(local_path)
    DuiTang(word, num, local_path)
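As a side note, the long hand-built query string above is fiddly to keep correct. The same per-page URL can be assembled with urllib.parse.urlencode; this is just a sketch, duitang_page_url is a made-up helper name, and the parameters are copied straight from the code above:

from urllib import parse

# Sketch: build one page of the Duitang search URL with urlencode
# instead of manual string concatenation (same endpoint and fields
# as the scraper above).
def duitang_page_url(word, page):
    params = {
        'kw': word,
        'type': 'feed',
        'include_fields': 'top_comments,is_root,source_link,item,buyable,'
                          'root_id,status,like_count,like_id,sender,album,'
                          'reply_count,favorite_blog_id',
        '_type': '',
        'start': page * 24,          # the API pages in steps of 24 items
        '_': 1540109911555 + page,   # cache-buster carried over from the code above
    }
    return 'https://www.duitang.com/napi/blog/list/by_search/?' + parse.urlencode(params)

print(duitang_page_url('柳岩', 1))

urlencode also percent-encodes the keyword, so the separate parse.quote call would no longer be needed.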
Enterdesk scraper (for Fan Bingbing):

import requests
import re
import os
from lxml import etree
from faker import Faker

faker = Faker(locale='zh_CN')


def mkdir(path):
    # Create the folder if it does not exist yet.
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)


def get(url):
    # Fetch a URL with a random User-Agent; returns None on failure.
    header = {'User-Agent': faker.user_agent()}
    i = 0
    while i < 1:
        try:
            result = requests.get(url, headers=header, timeout=10)
            return result
        except requests.exceptions.RequestException:
            i += 1


word = '范冰冰'  # search keyword (Fan Bingbing)
local_path = "D:\\Another\\temp\\ANOTHER_RUBBISH\\VIDEOS\\DFL_Src\\huiche\\" + word
mkdir(local_path)
url = 'https://www.enterdesk.com/search/1-0-6-0-0-0/' + word
while True:
    print(url)
    response = get(url)
    if response is None:
        break
    # Links to the individual picture pages on this search-result page.
    get_urls = etree.HTML(response.text).xpath('//dl[@class="egeli_pic_dl"]/dd/a/@href')
    if not get_urls:
        break
    for get_url in get_urls:
        print(get_url)
        response = get(get_url)
        if response is None:
            continue
        imgs = etree.HTML(response.text).xpath('/html/body/div[10]/div[1]/div[4]/div/div[1]/div/a/@src')
        img_single = etree.HTML(response.text).xpath('/html/body/div[10]/div[1]/div[2]/img/@src')
        if len(img_single) == 1:
            imgs.append(img_single[0])
        for img in imgs:
            print(img)
            name = img.split('/')[-1]
            try:
                filename = local_path + '/' + name
                if not os.path.isfile(filename):
                    document = get(img)
                    with open(filename, 'wb') as f:
                        f.write(document.content)
                    print('downloaded')
                else:
                    print('already exists')
            except Exception:
                print('download failed, skipping')
    # Advance to the next page by bumping the leading number in the search path.
    page_num = re.search('search/(.*)-0-6-0-0-0/', url).group(1)
    new_page_num = int(page_num) + 1
    url = url.replace('search/' + str(page_num) + '-0-6-0-0-0/', 'search/' + str(new_page_num) + '-0-6-0-0-0/')
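Both scrapers repeat the same "skip if the file already exists, ignore failed downloads" step, so if you extend them that step can live in one helper. A minimal sketch; save_image is a made-up name that appears in neither script above, and raise_for_status is added so HTTP error pages are skipped the same way as network errors:

import os
import requests

# Sketch of the shared download step: skip files that already exist and
# swallow per-image failures so one bad URL does not stop the whole run.
def save_image(img_url, local_path, headers=None):
    name = img_url.split('/')[-1]
    filename = os.path.join(local_path, name)
    if os.path.isfile(filename):
        print('already exists:', name)
        return
    try:
        document = requests.get(img_url, headers=headers, timeout=10)
        document.raise_for_status()  # treat 4xx/5xx responses as failures too
        with open(filename, 'wb') as f:
            f.write(document.content)
        print('downloaded:', name)
    except requests.exceptions.RequestException:
        print('download failed, skipping:', name)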