
Scrapy

Spider

python
import scrapy


class SampleSpider(scrapy.Spider):
    name = "sample"

    start_urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]

    def parse(self, response):
        # extract data from the response here, then yield items
        # (dicts, Items, or dataclasses)
        yield {"url": response.url}

Override / add settings

python
# class attribute on the spider; these override the project settings
custom_settings = {
    "SOME_SETTING": "some value",
}

Make requests

GET

python
yield scrapy.Request(
    url=url,
    cookies=cookies,
    callback=self.add_coordinates,
    meta={"data": j},  # meta travels with the request to the callback
)
python
yield scrapy.FormRequest(
    url=url,
    method="GET",  # formdata is URL-encoded into the query string
    dont_filter=True,  # don't drop the request if the URL was already seen
    formdata=payload,
    meta={"facetFilters": facetFilter, "numericFilters": numericFilter},
    callback=self.get_each_page,
)
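
Whatever goes into meta rides along with the request and can be read back from the response in the callback. A minimal sketch, assuming the add_coordinates callback named in the first request above:

python
def add_coordinates(self, response):
    # read back the value passed via the request's meta
    data = response.meta["data"]
    yield {"url": response.url, "data": data}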

POST

python
yield scrapy.Request(
    url=start_url,
    method="POST",
    body=json.dumps(payload),
    headers={"Content-Type": "application/json"},
    callback=self.get_cities,
)
python
yield scrapy.FormRequest(
    "https://api.example.com",  # a scheme is required or Scrapy raises ValueError
    callback=self.parse,
    method="POST",
    formdata=params,  # sent as application/x-www-form-urlencoded
)
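
For JSON APIs, scrapy.http.JsonRequest serializes the payload and sets the Content-Type header for you, so the json.dumps variant above can be shortened. A sketch reusing the names from that snippet:

python
from scrapy.http import JsonRequest

yield JsonRequest(
    url=start_url,
    data=payload,  # serialized to JSON; method defaults to POST when data is set
    callback=self.get_cities,
)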

Pipelines

Write as single-line JSON

python
# pipelines.py

import json

from itemadapter import ItemAdapter


class JsonWriterPipeline:
    def open_spider(self, spider):
        # one JSON object per line (JSON Lines, .jl)
        self.file = open(f"{spider.name}.jl", "w", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(ItemAdapter(item).asdict(), default=str) + "\n"
        self.file.write(line)
        return item
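
The pipeline only runs once it is registered in the project settings. A sketch, assuming the project package is named myproject (adjust the dotted path to your own project):

python
# settings.py

ITEM_PIPELINES = {
    "myproject.pipelines.JsonWriterPipeline": 300,  # lower numbers run first
}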

Middleware

Custom downloader

python
# middleware.py

from scrapy.http import HtmlResponse

import cloudscraper


class ProjectDownloaderMiddleware:
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        # return None

        if spider.name == "spider_name":
            # Fetch through cloudscraper instead of Scrapy's downloader
            # (e.g. to get past Cloudflare), then wrap the result so the
            # spider receives a normal Scrapy response.
            scraper = cloudscraper.create_scraper()
            r = scraper.get(request.url)
            return HtmlResponse(url=request.url, body=r.content, request=request)

        return None
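
As with pipelines, the middleware takes effect only once it is enabled in the settings. A sketch, again assuming a project package named myproject:

python
# settings.py

DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.ProjectDownloaderMiddleware": 543,
}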

Shell

Set header for shell

bash
$ scrapy shell
>>> from scrapy import Request
>>> req = Request("https://yoururl.com", headers={"header1": "value1"})
>>> fetch(req)

Use a local HTML file

bash
scrapy shell file:///path/to/file.html

Misc

python
# change the response encoding; replace() returns a new response
response = response.replace(encoding="utf-8")