Scrapy應用之抓取《宦海沉浮》小說

2023-02-07 13:15:59 字數 2642 閱讀 5775

目標站點

第一步:新建專案

keysdemacbook:desktop keys$ scrapy startproject mycrawl

New Scrapy project 'mycrawl', using template directory '/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/templates/project', created in:

/Users/keys/Desktop/mycrawl

You can start your first spider with:

cd mycrawl

scrapy genspider example example.com

第二步:建立爬蟲

keysdemacbook:desktop keys$ cd mycrawl/

keysdemacbook:mycrawl keys$ scrapy genspider firstspider www.shushu8.com/huanhaichenfu

第三步:配置items.py

import scrapy


class MyCrawlItem(scrapy.Item):
    """Container for one scraped chapter page of the novel.

    Fields:
        url: the page URL the chapter was scraped from.
        title: the chapter title text.
        text: the chapter body, joined into a single string by the spider.
    """

    # NOTE: the Scrapy API names are `scrapy.Item` and `scrapy.Field()`;
    # the lowercased `scrapy.item` / `scrapy.field` in the original would fail.
    url = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()

第四步:編寫爬蟲

# -*- coding: utf-8 -*-
import scrapy

from mycrawl.items import MyCrawlItem


class FirstSpiderSpider(scrapy.Spider):
    """Crawl all 502 chapter pages of the novel and yield one item per page."""

    name = 'firstspider'
    # allowed_domains must be a bare domain (no path); with a path appended,
    # OffsiteMiddleware would filter every request.
    allowed_domains = ['www.shushu8.com']
    # Chapter pages are numbered 1..502 under the book's directory.
    # NOTE(review): base URL was lost in the original snippet — confirm the
    # exact chapter-URL pattern against the target site.
    start_urls = ['https://www.shushu8.com/huanhaichenfu/' + str(i + 1)
                  for i in range(502)]

    def parse(self, response):
        """Extract URL, title, and chapter text from one chapter page.

        Yields:
            MyCrawlItem with 'url', 'title', and 'text' populated.
        """
        myitem = MyCrawlItem()
        myitem['url'] = response.url
        # Title sits in the page header; default to '' if the node is missing.
        myitem['title'] = response.xpath(
            '//*[@id="main"]/div[2]/div/div[1]/h1/text()').extract_first('')
        # #content holds many text nodes; join them into one string.
        myitem['text'] = ','.join(response.css('#content::text').extract())
        yield myitem

第五步:配置pipelines.py

# -*- coding: utf-8 -*-
import pymysql


class MysqlPipeline(object):
    """Write each scraped item to MySQL using a synchronous connection."""

    def __init__(self):
        # Connect to the local 'article' database. Positional order of
        # pymysql.connect is (host, user, password, database); keywords
        # make that explicit.
        self.conn = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='rootkeys',
            database='article',
            charset="utf8",
            use_unicode=True)  # original had lowercase `true` — a NameError
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item into the huanhaichenfu table and commit.

        Uses parameterized SQL (%s placeholders) so values are escaped
        by the driver rather than interpolated into the statement.
        """
        insert_sql = """
            insert into huanhaichenfu(url, title, text)
            values (%s, %s, %s)
        """
        self.cursor.execute(
            insert_sql,
            (item["url"], item["title"], item["text"]))
        self.conn.commit()
        # Scrapy pipeline contract: return the item so later pipelines
        # (if any are enabled) still receive it.
        return item

第六步:配置settings.py

# -*- coding: utf-8 -*-

# Scrapy only recognizes UPPERCASE setting names; the lowercased
# versions in the original snippet would be silently ignored.
BOT_NAME = 'mycrawl'

SPIDER_MODULES = ['mycrawl.spiders']
NEWSPIDER_MODULE = 'mycrawl.spiders'

# Do not fetch/obey robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# Enable the MySQL pipeline (the dict value is the run order, 0-1000;
# the original snippet's assignment was truncated).
ITEM_PIPELINES = {
    'mycrawl.pipelines.MysqlPipeline': 300,
}

第七步:執行爬蟲

import os
import sys

from scrapy.cmdline import execute

# Spider to launch; must match the spider's `name` attribute.
spider_name = 'firstspider'

if __name__ == '__main__':
    # Equivalent to running `scrapy crawl firstspider` from the shell.
    print('running spider of ' + spider_name)
    execute(['scrapy', 'crawl', spider_name])