Using BeautifulSoup to crawl a site's paginated data and store it in MySQL
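The script below walks a job board's paginated listing pages one by one, uses BeautifulSoup to pull each posting out of the listing markup, and inserts one row per posting through pymysql. Judging by the MySQL column names (zhiweis, gongsi, xinzhi, shijian), the four scraped fields are the job title, company, salary, and posting time.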

# Crawler: fetch each listing page, parse the postings, and insert them into MySQL
import requests
from bs4 import BeautifulSoup
import pymysql

def download_page(http_url):
    # Send a browser-like User-Agent so the request is less likely to be rejected
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    response = requests.get(http_url, headers=headers, timeout=10)
    return response.text

def get_page_content(html, page):
    # Parse one listing page and insert every posting into MySQL.
    # Returns False when the page has no listing container, so the caller can stop paging.
    soup = BeautifulSoup(html, 'html.parser')
    con = soup.find('div', class_='plist')
    if con is None:
        return False
    conn = pymysql.connect(host='127.0.0.1', user='ceshi', password='ceshi',
                           database='ceshi', charset='utf8mb4')
    cursor = conn.cursor()
    con_list = con.find_all('div', class_='pl')
    for item in con_list:
        job_title = item.find('div', class_='td-j-name').find('a').get_text()
        company = item.find('div', class_='td3 link_gray6').find('a').get_text()
        salary = item.find('div', class_='td4').get_text()
        post_time = item.find('div', class_='td5').get_text()

        print(page, job_title, company, salary, post_time)
        sql = 'INSERT INTO a123(zhiweis, gongsi, xinzhi, shijian) VALUES (%s, %s, %s, %s)'
        cursor.execute(sql, (job_title, company, salary, post_time))
    conn.commit()  # commit once per page instead of once per row
    cursor.close()
    conn.close()
    return True
def main():
    for i in range(1, 1000):
        http_url = 'http://www.masaike.com/jobs/jobs_list/page/{}.htm'.format(i)
        html = download_page(http_url)
        if not get_page_content(html, i):
            break  # stop as soon as a page no longer contains listings


if __name__ == '__main__':
    main()
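The INSERT statement assumes a table named a123 already exists in the ceshi database. Below is a minimal sketch of a compatible schema; the column types are guesses based on the scraped text fields, not the site's original table definition:

import pymysql

# Hypothetical schema matching the four scraped fields
ddl = """
CREATE TABLE IF NOT EXISTS a123 (
    id      INT AUTO_INCREMENT PRIMARY KEY,
    zhiweis VARCHAR(255),  -- job title
    gongsi  VARCHAR(255),  -- company
    xinzhi  VARCHAR(64),   -- salary text
    shijian VARCHAR(64)    -- posting time text
) CHARACTER SET utf8mb4
"""

conn = pymysql.connect(host='127.0.0.1', user='ceshi', password='ceshi',
                       database='ceshi', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.close()

utf8mb4 is chosen so that Chinese job titles and company names round-trip safely between the page and the database.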

 
