import requests
response = requests.get(網址, headers=請求頭)  # headers 必須用關鍵字參數傳入,第二個位置參數是 params
content = response.content.decode()
1,導入模塊,創建對象
from lxml import etree
obj = etree.HTML(html內容)
2,使用xpath方法提取內容
_Element對象列表 = obj.xpath(xpath式子)  # 注意:xpath 方法返回的是列表(list of _Element)
列印的效果如下:
<class 'lxml.etree._Element'>
import requests
from lxml import etree


class WuyouSpider:
    """Spider that fetches the 51job search results page for 'python' jobs
    and extracts basic info (title, url, place, publish date) for each listing.
    """

    def __init__(self):
        """Set up the search URL and the request headers."""
        # NOTE(review): original URL contained mojibake "°reefrom" — an HTML-entity
        # corruption of "&degreefrom"; restored here.
        self.url = (
            "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html"
            "?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
            "&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
            "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line="
            "&specialarea=00&from=&welfare="
        )
        self.user_agent = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
        }

    def start(self):
        """Run the spider: fetch the page, extract the jobs, return them.

        :return: list of dicts, one per job listing
        """
        # Fetch the raw page content for the search URL.
        content = self.parse_url(self.url)
        # Hand the extraction work to a dedicated method.
        job_dict_list = self.get_data(content)
        return job_dict_list

    def parse_url(self, url):
        """Send a GET request to *url* and return the decoded response body.

        :param url: the address to request
        :return content: the response body as text
        """
        res = requests.get(url, headers=self.user_agent)
        # 51job historically serves GBK-encoded pages — TODO confirm encoding.
        content = res.content.decode('gbk')
        return content

    def get_data(self, content):
        """Extract job records from the result-page HTML.

        :param content: HTML text of the search results page
        :return: list of dicts with keys name / url / place / pub_date
        """
        # Container that accumulates one dict per job listing.
        job_dict_list = list()
        html = etree.HTML(content)
        # NOTE(review): the row selector was lost in the original transcription;
        # 51job result rows are usually div.el under #resultList — TODO confirm.
        div_list = html.xpath("//div[@id='resultList']/div[@class='el']")
        for div in div_list:
            # Job title and detail-page link.
            name = div.xpath("./p//a/@title")[0]
            job_url = div.xpath("./p//a/@href")[0]
            # Work location.
            place = div.xpath("./span[2]/text()")[0]
            # Publish date.
            pub_date = div.xpath("./span[4]/text()")[0]
            job_info_dict = {
                "name": name,
                "url": job_url,
                "place": place,
                "pub_date": pub_date,
            }
            job_dict_list.append(job_info_dict)
        # Return the filled data container.
        return job_dict_list


if __name__ == "__main__":
    wu = WuyouSpider()
    print(wu)
    wu.start()