目標站點
第一步:新建專案
keysdemacbook:desktop keys$ scrapy startproject mycrawl
New Scrapy project 'mycrawl', using template directory '/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /Users/keys/Desktop/mycrawl
You can start your first spider with:
    cd mycrawl
    scrapy genspider example example.com
第二步:建立爬蟲
keysdemacbook:desktop keys$ cd mycrawl
keysdemacbook:mycrawl keys$ scrapy genspider firstspider www.shushu8.com/huanhaichenfu
第三步:配置items.py
import scrapy


class mycrawlitem(scrapy.Item):
    """Container for one scraped chapter page: its URL, title, and body text.

    Fix: the pasted original fused the import with the class line and
    lowercased the Scrapy API (`scrapy.item` / `scrapy.field`), which raises
    AttributeError at import time — the real names are `scrapy.Item` and
    `scrapy.Field`.
    """

    # NOTE(review): class name kept lowercase because the spider imports it
    # as `mycrawlitem`; PEP 8 would name it MycrawlItem.
    url = scrapy.Field()    # page URL the item was scraped from
    title = scrapy.Field()  # chapter title text
    text = scrapy.Field()   # chapter body, joined into one string
第四步:編寫爬蟲
# -*- coding: utf-8 -*-
import scrapy

from mycrawl.items import mycrawlitem


class firstspiderspider(scrapy.Spider):
    """Crawl every chapter page of the novel and yield one item per page.

    Fix: the pasted original fused the coding line with the import and
    lowercased `scrapy.Spider` to `scrapy.spider`, which raises
    AttributeError at import time.
    """

    name = 'firstspider'
    allowed_domains = ['www.shushu8.com/huanhaichenfu']
    # NOTE(review): the base-URL prefix was lost in the original paste —
    # '' + str(i + 1) yields bare strings '1'..'502', which are not valid
    # URLs. Presumably the prefix was
    # 'http://www.shushu8.com/huanhaichenfu/' — confirm against the site.
    start_urls = ['' + str(i + 1) for i in range(502)]

    def parse(self, response):
        """Extract URL, title, and body text from one chapter page."""
        myitem = mycrawlitem()
        myitem['url'] = response.url
        # extract_first('') falls back to '' when the title node is missing.
        myitem['title'] = response.xpath(
            '//*[@id="main"]/div[2]/div/div[1]/h1/text()').extract_first('')
        # Join all text nodes under #content into one comma-separated string.
        myitem['text'] = ','.join(response.css('#content::text').extract())
        yield myitem
第五步:配置pipelines.py
# -*- coding: utf-8 -*-
import pymysql


class mysqlpipeline(object):
    """Write each scraped item into the MySQL table `article.huanhaichenfu`.

    Uses a synchronous (blocking) connection — acceptable for a small crawl.

    Fixes: the pasted original fused the coding line with the import,
    spelled the boolean as lowercase `true` (a NameError), and never
    returned the item from `process_item` — Scrapy's pipeline contract
    requires returning the item so later pipelines receive it instead
    of None.
    """

    def __init__(self):
        # Positional args: host, user, password, database.
        self.conn = pymysql.connect(
            '127.0.0.1',
            'root',
            'rootkeys',
            'article',
            charset="utf8",
            use_unicode=True)  # was lowercase `true` — NameError
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: values are bound by the driver, not
        # interpolated into the SQL string.
        insert_sql = """
        insert into huanhaichenfu(url, title, text)
        values (%s, %s, %s)
        """
        self.cursor.execute(
            insert_sql,
            (item["url"],
             item["title"],
             item["text"]))
        self.conn.commit()
        return item  # keep the item flowing to any later pipelines
第六步:配置settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for the mycrawl project.
# Fix: setting names must be UPPERCASE — Scrapy silently ignores
# lowercase names like `bot_name`; and `false` is a NameError (`False`).
BOT_NAME = 'mycrawl'

SPIDER_MODULES = ['mycrawl.spiders']
NEWSPIDER_MODULE = 'mycrawl.spiders'

# Skip robots.txt so the chapter pages are not filtered out.
ROBOTSTXT_OBEY = False

# NOTE(review): the pipeline mapping was lost in the original paste
# (`item_pipelines =` had no value); reconstructed to enable the MySQL
# pipeline defined in pipelines.py — confirm the dotted path.
ITEM_PIPELINES = {
    'mycrawl.pipelines.mysqlpipeline': 300,
}
第七步:執行爬蟲
import os
import sys

from scrapy.cmdline import execute

# Launcher script: equivalent to running `scrapy crawl firstspider`
# from the project directory, but runnable from an IDE.
# Fix: the pasted original fused `import os` and `import sys` into the
# invalid line `import osimport sys`.
# NOTE(review): os and sys are otherwise unused — a line such as
# `sys.path.append(os.path.dirname(os.path.abspath(__file__)))` was
# probably lost in the paste; confirm before removing the imports.

run_spider = 'firstspider'

if __name__ == '__main__':
    print('running spider of ' + run_spider)
    execute(['scrapy', 'crawl', run_spider])