Crawler Share: A Company-Information Crawler
The previous posts covered quite a lot of machine learning, so this time here is something from the development side: a crawler for Hong Kong company information. Since this is a crawler write-up, we will walk briefly through the overall flow and then go straight to the code.
Overall project flow (a sketch of the project layout follows this list):
Get familiar with the page structure and the fields we need to extract
Fetch and parse the pages with the Scrapy framework
Store the structured data in a MySQL database
Show the results
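Judging from the imports that appear later (company_crawl.items, from .util import *, the pipeline class), the project presumably follows the standard Scrapy layout. The file names below are my guesses, not something the original post states:

company_crawl/
├── scrapy.cfg
└── company_crawl/
    ├── items.py           # CompanyCrawlItem definition
    ├── pipelines.py       # MySQL storage pipeline (part III)
    ├── settings.py
    └── spiders/
        ├── hk_company.py  # the spider (part II)
        └── util.py        # level-3 page parser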
I. Get familiar with the page structure and the fields we need to extract:
1. Entering the target site, we find the industry index; this is the level-1 page we crawl.
2. Following each industry link takes us to the level-2 pages, the paginated company listings.
3. Following each company link takes us to the level-3 pages, the company detail records.
Those are the three levels this crawl walks through; once a detail page is fetched, the relevant fields are parsed out to produce the final record.
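One detail worth knowing up front (it is visible in the spider code rather than stated on the site): the listing pages paginate by appending a zero-based page index to the industry URL, i.e. https://hongkong.mingluji.com/<industry>/0, /1, and so on, which is why the spider simply loops over range(page_num).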
II. Fetch and parse the pages with the Scrapy framework (the spider file)
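One note before the spider: it imports CompanyCrawlItem from items.py, which the original post does not show. Reconstructed from the fields the spider assigns, the item would look roughly like this:

import scrapy

class CompanyCrawlItem(scrapy.Item):
    # Fields inferred from the assignments in parse_url_three below.
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    contact_person = scrapy.Field()
    contact_person_job = scrapy.Field()
    company_address = scrapy.Field()
    company_telephone = scrapy.Field()
    company_fax_number = scrapy.Field()
    company_email = scrapy.Field()
    company_country = scrapy.Field()
    company_postal_code = scrapy.Field()
    industry_pages = scrapy.Field()
    industry_name = scrapy.Field()

The spider file: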
import scrapy
from .util import *
from lxml import etree
from company_crawl.items import CompanyCrawlItem


class HkCompanySpider(scrapy.Spider):
    name = 'hk_company'
    start_urls = ['https://hongkong.mingluji.com/Industry_Index']
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }

    def start_requests(self):
        # A plain GET is all the industry index needs.
        yield scrapy.Request(url=self.start_urls[0], headers=self.headers,
                             callback=self.parse_url_one)
    def parse_url_one(self, response):
        """Level 1: collect every industry's name, URL, company count and page count."""
        html = etree.HTML(response.text)
        all_industry_name_list = html.xpath('//*[@id="mw-content-text"]/table/tr/td[2]/a/text()')
        all_industry_url_list = html.xpath('//*[@id="mw-content-text"]/table/tr/td[2]/a/@href')
        all_industry_url_list_new = list(map(lambda x: "https://hongkong.mingluji.com" + str(x),
                                             all_industry_url_list))
        # The counts are rendered like "1,234 ..."; keep the leading number only.
        all_industry_company_num = html.xpath('//*[@id="mw-content-text"]/table/tr/td[3]/text()')
        all_industry_company_num = list(map(lambda x: int(x.split(' ')[0].replace(',', '')),
                                            all_industry_company_num))
        all_industry_company_page_num = html.xpath('//*[@id="mw-content-text"]/table/tr/td[4]/text()')
        all_industry_company_page_num = list(map(lambda x: int(x.split(' ')[0].replace(',', '')),
                                                 all_industry_company_page_num))
        all_content_level_1 = list(zip(all_industry_name_list, all_industry_url_list_new,
                                       all_industry_company_num, all_industry_company_page_num))
        for industry_name, industry_url, company_nums, page_num in all_content_level_1:
            # The site paginates listings as <industry_url>/0, /1, ...
            for page in range(page_num):
                yield scrapy.Request(industry_url + '/' + str(page), headers=self.headers,
                                     callback=self.parse_url_two,
                                     meta={'industry_name': industry_name,
                                           'company_nums': company_nums,
                                           'industry_pages': page_num})
    def parse_url_two(self, response):
        """Level 2: collect every company link on an industry listing page."""
        industry_name = response.meta['industry_name']
        company_nums = response.meta['company_nums']
        industry_pages = response.meta['industry_pages']
        html_1 = etree.HTML(response.text)
        all_company_url_list = html_1.xpath('//*[@id="mw-content-text"]/table/tr/td/ol/li/a/@href')
        all_company_url_list_new = list(map(lambda x: "https://hongkong.mingluji.com" + str(x),
                                            all_company_url_list))
        all_company_name_list = html_1.xpath('//*[@id="mw-content-text"]/table/tr/td/ol/li/a/text()')
        for company_url, company_name in zip(all_company_url_list_new, all_company_name_list):
            yield scrapy.Request(url=company_url, headers=self.headers,
                                 callback=self.parse_url_three,
                                 meta={'industry_name': industry_name,
                                       'company_nums': company_nums,
                                       'industry_pages': industry_pages,
                                       'company_url': company_url})
    def parse_url_three(self, response):
        """Level 3: parse the company detail page and emit an item."""
        res_content = get_level_3_content(response.text)
        item = CompanyCrawlItem()
        item['company_name'] = res_content[0]
        item['company_url'] = response.meta['company_url']
        item['contact_person'] = res_content[1]
        item['contact_person_job'] = res_content[2]
        item['company_address'] = res_content[3]
        item['company_telephone'] = res_content[4]
        item['company_fax_number'] = res_content[5]
        item['company_email'] = res_content[6]
        item['company_country'] = res_content[7]
        item['company_postal_code'] = res_content[-1]
        item['industry_pages'] = response.meta['industry_pages']
        item['industry_name'] = response.meta['industry_name']
        yield item
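The get_level_3_content helper used in parse_url_three lives in util.py, pulled in at the top via from .util import *: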
# -*- coding: utf-8 -*-
# @Organization :
# @Author : hhx
# @Time : 2023/1/27 11:49 AM
# @Email : phhx223@163.com
import re

r"""
In a regex, `.` does not match newlines, which is why the patterns below use
`[\s\S]` to match any character at all. This module parses the level-3
(company detail) pages.
"""
def get_level_3_content(url_text):
    """Pull the labelled fields out of a company detail page with regexes."""
    url_text = str(url_text)

    def exit_or_not(list_):
        # Return the first match with HTML tags and newlines stripped,
        # or None when the field is absent from the page.
        pattern_str_sub = re.compile('<.*?>')
        if list_:
            return re.sub(pattern_str_sub, '', list_[0]).replace('\n', '')
        return None

    # The labels are the site's Traditional Chinese field names: company name,
    # contact person, job title, office address, telephone, fax, email,
    # website, postal code.
    pattern_company_name = re.compile(r'公司名稱[\s\S]*?"name">(.*?)<')
    pattern_contact_person = re.compile(r'聯繫人員[\s\S]*?"name">(.*?)<')
    pattern_job_title = re.compile(r'工作職務[\s\S]*?"jobTitle">(.*?)<')
    pattern_Address = re.compile(r'辦公地址[\s\S]*?"address">(.[\s\S]*?)</dd>')
    pattern_telephone = re.compile(r'電話號碼[\s\S]*?"telephone">(.[\s\S]*?)<')
    pattern_fax_number = re.compile(r'傳真號碼[\s\S]*?"faxNumber">(.[\s\S]*?)<')
    pattern_email = re.compile(r'電子郵箱[\s\S]*?"email">(.[\s\S]*?)<')
    pattern_Website_url = re.compile(r'網站網址[\s\S]*?"location">(.[\s\S]*?)<')
    pattern_postal_code = re.compile(r'郵政編碼[\s\S]*?>(.[\s\S]*?)</dd>')

    company_name = exit_or_not(re.findall(pattern_company_name, url_text))
    contact_person = exit_or_not(re.findall(pattern_contact_person, url_text))
    job_title = exit_or_not(re.findall(pattern_job_title, url_text))
    Address = exit_or_not(re.findall(pattern_Address, url_text))
    telephone = exit_or_not(re.findall(pattern_telephone, url_text))
    fax_number = exit_or_not(re.findall(pattern_fax_number, url_text))
    email = exit_or_not(re.findall(pattern_email, url_text))
    # Note: the spider stores this "location" value in item['company_country'].
    Website_url = exit_or_not(re.findall(pattern_Website_url, url_text))
    postal_code = exit_or_not(re.findall(pattern_postal_code, url_text))
    return [company_name, contact_person, job_title, Address, telephone,
            fax_number, email, Website_url, postal_code]
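As a quick sanity check of the parser, here is a made-up fragment in the markup style the regexes expect (the real pages pair these labels with schema.org-style class names; the snippet itself is only an assumption):

sample = '''
<dt>公司名稱</dt><dd><span class="name">Example Trading Ltd</span></dd>
<dt>電話號碼</dt><dd><span class="telephone">+852 1234 5678</span></dd>
'''
print(get_level_3_content(sample))
# ['Example Trading Ltd', None, None, None, '+852 1234 5678', None, None, None, None]

Fields missing from a page come back as None rather than raising, which is what lets the spider assign every index of the result unconditionally.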
III. Store the structured data in a MySQL database
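Before the pipeline can run it has to be enabled in settings.py. Given the project module seen in the spider's imports (company_crawl), that is one entry; 300 is just the conventional middle priority:

# settings.py
ITEM_PIPELINES = {
    'company_crawl.pipelines.CompanyCrawlPipeline': 300,
}

The pipeline file itself: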
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import datetime

import pymysql


class CompanyCrawlPipeline:
    def __init__(self):
        self.now = str(datetime.date.today())
        # Credentials are redacted in the original post.
        self.connect = pymysql.connect(host='localhost', user='root', password='*******',
                                       db='Crawl_info', port=3306)
        self.cursor = self.connect.cursor()
    def process_item(self, item, spider):
        # Parameterised query: pymysql handles quoting and escaping of the values.
        self.cursor.execute(
            'insert ignore into hk_company(company_name, company_url, contact_person, '
            'contact_person_job, company_address, company_telephone, company_fax_number, '
            'company_email, company_country, company_postal_code, industry_pages, '
            'industry_name, crawl_time) '
            'values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
            (item['company_name'], item['company_url'], item['contact_person'],
             item['contact_person_job'], item['company_address'], item['company_telephone'],
             item['company_fax_number'], item['company_email'], item['company_country'],
             item['company_postal_code'], item['industry_pages'], item['industry_name'],
             self.now))
        self.connect.commit()
        return item
    def close_spider(self, spider):
        # Scrapy calls this automatically when the spider finishes.
        self.cursor.close()
        self.connect.close()
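One thing the pipeline's insert ignore relies on: deduplication only happens if hk_company has a unique key. The original post does not show the table definition, so here is a one-off setup sketch with assumed column types and an assumed unique key on company_url:

import pymysql

DDL = '''
CREATE TABLE IF NOT EXISTS hk_company (
    id INT AUTO_INCREMENT PRIMARY KEY,
    company_name VARCHAR(255),
    company_url VARCHAR(512),
    contact_person VARCHAR(255),
    contact_person_job VARCHAR(255),
    company_address TEXT,
    company_telephone VARCHAR(64),
    company_fax_number VARCHAR(64),
    company_email VARCHAR(255),
    company_country VARCHAR(255),
    company_postal_code VARCHAR(32),
    industry_pages INT,
    industry_name VARCHAR(255),
    crawl_time DATE,
    UNIQUE KEY uq_company_url (company_url)
) DEFAULT CHARSET = utf8mb4
'''

connect = pymysql.connect(host='localhost', user='root', password='*******',
                          db='Crawl_info', port=3306)
with connect.cursor() as cursor:
    cursor.execute(DDL)
connect.commit()
connect.close()

With the table in place, the whole thing runs with scrapy crawl hk_company.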
Friendly reminder: this is strictly for personal study, never for commercial use! If anything in the code is unclear, feel free to leave a comment!