某CMS系统python爬虫 爬取信息存入指定数据库
- 共 641 次检阅

import time
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup


# Interactive configuration. Every value is read from stdin as a string and is
# consumed by the functions defined further down in this file.
print("-------本脚本仅用于测试爬取数据,不得非法使用----------")
# Host name without the scheme. Surrounding whitespace and a trailing slash
# are removed so the URLs built from it do not contain spaces or "//".
wangzhi = input("请输入网址不带(http://):").strip().rstrip('/')
pages = input("您要爬取多少页:")  # number of list pages to crawl
httpseo = input("您输入的网址是否支持https(1是 2不是): ")  # 1 selects https
cookies = input("请输入cookie:")  # raw value for the Cookie request header
jgtime = input("请输入间隔时间(单位:秒):")  # pause between page requests, in seconds
# Optional MySQL settings; disabled together with the database code below.
# mysqlhost = input("请输入数据库地址:")
# mysqluser = input("请输入数据库用户名:")
# mysqlpwd = input("请输入数据库密码:")
# mysqldatabase = input("请输入数据库名:")
# mysqlbiao = input("请输入数据库表名:")

print("-------正在爬取数据......----------")
time.sleep(1)

# Choose the URL scheme. Blank or non-numeric answers used to crash int();
# they now fall back to plain http, like any answer other than 1.
try:
    httpss = 'https' if int(httpseo) == 1 else 'http'
except ValueError:
    httpss = 'http'


# Create the database before using this script.
# For an auto-incrementing primary key, the column needs: auto_increment primary key
# conn = pymysql.connect(mysqlhost, mysqluser, mysqlpwd, mysqldatabase)
# cursor = conn.cursor()
# creates = ('create table ' + mysqlbiao + ' (id int auto_increment primary key, zhiweis text, gongsi text, xinzhi text, shijian text) engine=innodb default charset=utf8;')
# cursor.execute(creates)
# print(creates)

def get_html(url):
    """Download *url* and return its body as text.

    The request carries a desktop Chrome User-Agent and, when the user
    supplied one, the module-level ``cookies`` string as the Cookie header.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    }
    # Send the cookie verbatim. Wrapping it in quote characters (as this
    # function previously did) corrupts the first cookie name and the last
    # cookie value. An empty cookie is simply not sent.
    cookie = str(cookies).strip()
    if cookie:
        headers['Cookie'] = cookie

    # requests has no default timeout; without one an unresponsive server
    # would block the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None

def get_infos(html):
    """Parse one job-list page and print every listing found in it.

    Five CSS selections are walked in lockstep; each group of matching
    elements is printed as a dict with the keys ``职位``, ``公司名称``,
    ``薪资``, ``更新时间`` and ``lianjie`` (the listing link, resolved
    against the configured site).

    Args:
        html: Page markup as returned by ``get_html``. ``None`` (a failed
            download) is ignored.

    Returns:
        None. All output goes to stdout.
    """
    # get_html returns None for non-200 responses; there is nothing to parse.
    if html is None:
        return

    soup = BeautifulSoup(html, "html.parser")

    ranks = soup.select('.td-j-name')
    names = soup.select('.line_substring')
    times = soup.select('.td4')
    shijian1 = soup.select('.td5')
    lianjies = soup.select('.td-j-name > a')

    # Site root used to resolve links; it does not change between rows.
    base_url = '{}://{}/'.format(httpss, wangzhi)

    # Database persistence (disabled). Re-enable together with the connection
    # setup at the top of the file.
    # for r, n, t, k, u in zip(ranks, names, times, shijian1, lianjies):
    #         r = r.get_text()
    #         n = n.get_text()
    #         t = t.get_text()
    #         k = k.get_text()
    #         u = u.get("href")
    #         sql = "insert into " + mysqlbiao + " values(0,%s,%s,%s,%s)"
    #         value = (r, n, t, k)
    #         cursor.execute(sql, value)
    #         conn.commit()

    # Print the scraped listings to the console.
    # NOTE: zip() stops at the shortest selection, so a row missing one of the
    # elements shifts the pairing of all rows after it.
    for r, n, t, k, u in zip(ranks, names, times, shijian1, lianjies):
        data = {
            '职位': r.get_text(),
            '公司名称': n.get_text(),
            '薪资': t.get_text(),
            '更新时间': k.get_text(),
            # urljoin copes with relative, root-relative and absolute hrefs;
            # plain concatenation produced "//" or a doubled host for the
            # latter two. A missing href resolves to the site root.
            'lianjie': urljoin(base_url, u.get('href') or ''),
        }
        print(data)
    # NOTE(review): this message is printed even though the database insert
    # above is disabled.
    print("--------存入mysql成功------------")
def main():
    """Crawl the requested number of job-list pages and print their listings.

    Builds one list URL per page from the module-level ``httpss``,
    ``wangzhi`` and ``pages`` settings, downloads each with ``get_html`` and
    hands the markup to ``get_infos``. Pages that could not be downloaded are
    reported and skipped.

    Raises:
        ValueError: If ``pages`` is not an integer or ``jgtime`` is not a
            number. Both are checked before the first request is made.
    """
    page_count = int(pages)
    # Parse the delay once, up front, so bad input fails before any request
    # is made. float() also allows sub-second pauses such as "0.5".
    delay = float(jgtime)

    urls = ['{}://{}/jobs/jobs_list/page/{}.html'.format(httpss, wangzhi, i) for i in range(1, page_count + 1)]
    print(urls)
    for index, url in enumerate(urls, start=1):
        html = get_html(url)
        if html is None:
            # get_html returns None for non-200 responses; skip the page
            # instead of passing None to the parser.
            print("--------页面获取失败,已跳过: {}------------".format(url))
        else:
            get_infos(html)
        # Pause between pages to avoid putting load on the server; there is
        # nothing left to wait for after the last page.
        if index < len(urls):
            time.sleep(delay)
if __name__ == '__main__':
    main()
    # Report completion only after a crawl has actually run, not when the
    # module is merely imported.
    print("--------爬取数据库完成,关闭数据库连接------------")
    # conn.close()

 

分享到:

这篇文章还没有评论

发表评论