import requests
import time
from bs4 import BeautifulSoup
import pymysql
print("-------本脚本仅用于测试爬取数据,不得非法使用----------")
wangzhi = input("请输入网址不带(http://):")
pages = input("您要爬取多少页:")
httpseo = input("您输入的网址是否支持https(1是 2不是): ")
cookies = input("请输入cookie:")
jgtime = input("请输入间隔时间(单位:秒):")
# mysqlhost = input("请输入数据库地址:")
# mysqluser = input("请输入数据库用户名:")
# mysqlpwd = input("请输入数据库密码:")
# mysqldatabase = input("请输入数据库名:")
# mysqlbiao = input("请输入数据库表名:")
print("-------正在爬取数据......----------")
time.sleep(1)
if int(httpseo) == 1:
httpss = 'https'
else:
httpss = 'http'
# 使用前请先创建数据库
# 主键自增续添加 auto_increment primary key
# conn = pymysql.connect(mysqlhost, mysqluser, mysqlpwd, mysqldatabase)
# cursor = conn.cursor()
# creates = ('create table ' + mysqlbiao + ' (id int auto_increment primary key, zhiweis text, gongsi text, xinzhi text, shijian text) engine=innodb default charset=utf8;')
# cursor.execute(creates)
# print(creates)
def get_html(url):
    """Fetch *url* and return its body text, or None on any failure.

    Sends the operator-supplied cookie and a desktop browser User-Agent.
    Returns None for network errors and non-200 responses, matching the
    original contract (callers must handle a None result).
    """
    headers = {
        # Fix: the original wrapped the cookie in literal single quotes
        # ("'<cookie>'"), which corrupts the Cookie header value; send it
        # exactly as the operator entered it.
        'Cookie': str(cookies),
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    }
    try:
        # Fix: add a timeout so one unresponsive server cannot hang the
        # whole crawl; treat connection errors as a failed fetch.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_infos(html):
    """Parse one job-listing page and print each job record to the console.

    ``html`` is the page source from get_html(). A ``None``/empty value
    (failed fetch) is skipped instead of crashing BeautifulSoup — this was
    a real crash path, since get_html() returns None on non-200 responses.
    """
    if not html:
        # Fix: BeautifulSoup(None, ...) raises; a failed page is just skipped.
        return
    soup = BeautifulSoup(html, "html.parser")
    ranks = soup.select('.td-j-name')          # job-title cells
    names = soup.select('.line_substring')     # company-name cells
    times = soup.select('.td4')                # salary cells
    shijian1 = soup.select('.td5')             # update-time cells
    lianjies = soup.select('.td-j-name > a')   # detail-page links
    # # Optional: write rows into MySQL (requires the commented-out setup above)
    # for r, n, t, k, u in zip(ranks, names, times, shijian1, lianjies):
    #     r = r.get_text()
    #     n = n.get_text()
    #     t = t.get_text()
    #     k = k.get_text()
    #     u = u.get("href")
    #     sql = "insert into " + mysqlbiao + " values(0,%s,%s,%s,%s)"
    #     value = (r, n, t, k)
    #     cursor.execute(sql, value)
    #     conn.commit()
    # Hoisted out of the loop: the URL prefix is loop-invariant.
    base = '{}://{}/'.format(httpss, wangzhi)
    for r, n, t, k, u in zip(ranks, names, times, shijian1, lianjies):
        data = {
            '职位': r.get_text(),
            '公司名称': n.get_text(),
            '薪资': t.get_text(),
            '更新时间': k.get_text(),
            'lianjie': base + u.get('href'),
        }
        print(data)
    # NOTE(review): this message claims a MySQL insert succeeded, but the DB
    # code above is commented out — only console printing happens here.
    print("--------存入mysql成功------------")
def main():
    """Crawl every listing page in order, pausing between requests."""
    listing_urls = [
        '{}://{}/jobs/jobs_list/page/{}.html'.format(httpss, wangzhi, page)
        for page in range(1, int(pages) + 1)
    ]
    print(listing_urls)
    for listing_url in listing_urls:
        get_infos(get_html(listing_url))
        # Throttle between pages: avoids hammering the server and keeps a
        # slow machine from being overwhelmed by back-to-back requests.
        time.sleep(int(jgtime))
# Script entry point: run the crawl, then announce completion.
if __name__ == '__main__':
    main()
    print("--------爬取数据库完成,关闭数据库连接------------")
    # conn.close()
# (stray page text pasted into the file — commented out, it was a SyntaxError as a bare statement): 这篇文章还没有评论