A Scrapy Python script to crawl dark web (.onion) websites.

Scrapy is a fast, open-source web crawling and data extraction framework written in Python, widely used for web scraping and mining structured data from websites. Unlike simple scraping libraries, Scrapy is a comprehensive framework that manages every aspect of the scraping process, from sending HTTP requests and handling responses to processing, cleaning, and storing the extracted data in formats such as JSON, CSV, or XML. The example further below is a Scrapy spider that crawls both a regular website and a dark web (.onion) site through the Tor SOCKS proxy, scans the onion service with OnionScan, indexes the results in Elasticsearch, and caches the latest results in Redis.
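Before the full spider, here is a minimal, hedged sketch of that request/extract/store cycle; the URL and CSS selectors are placeholders and are not part of the dark web spider. Items yielded as plain dicts are written out by Scrapy's feed exports.

import scrapy

class MinimalSpider(scrapy.Spider):
    name = 'minimalspider'
    start_urls = ['https://www.example.com/']

    def parse(self, response):
        # Each yielded dict becomes one record in the exported feed
        yield {
            'title': response.css('h1::text').get(),
            'links': response.css('a::attr(href)').getall(),
        }

# Save as minimal_spider.py and run:
#   scrapy runspider minimal_spider.py -O items.json   (items.csv or items.xml also work)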

# Create a scrapy project first
# scrapy startproject myproject
# cd myproject
# Save this spider under myproject/spiders/, then run: scrapy crawl combinedspider

import json
import socket

import scrapy
import socks
import redis
from elasticsearch import Elasticsearch
from onionscan import Onion  # assumes a Python wrapper for OnionScan that exposes an Onion class

class CombinedSpider(scrapy.Spider):
    name = 'combinedspider'
    start_urls = ['https://www.example.com/', 'http://exampleonion.onion/']


    def __init__(self, *args, **kwargs):
        super(CombinedSpider, self).__init__(*args, **kwargs)
        # Set up the Tor SOCKS proxy (see the note after the spider for a caveat)
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
        socket.socket = socks.socksocket
        # Set up the Elasticsearch client
        self.es = Elasticsearch(['http://localhost:9200'])
        # Set up the Redis client
        self.redis = redis.Redis(host='localhost', port=6379, db=0)
        # Set up the OnionScan instance
        self.onion = Onion()

    def parse(self, response):
        if response.url.startswith('http://exampleonion.onion/'):
            # Scan the onion service
            report = self.onion.scan(response.url)
            # Extract data from the report
            data = {
                'summary': report.summary(),
                'services': report.services(),
                'links': report.links(),
            }
            # Index the data in Elasticsearch (doc_type is not accepted by recent clients)
            self.es.index(index='onionindex', document=data)
            # Cache the data in Redis (serialize first; Redis cannot store a dict directly)
            self.redis.set('last_scanned_onion', json.dumps(data, default=str))
        else:
            # Extract data from page
            data = {
                'title': response.css('h1::text').get(),
                # ::text plus getall() collects the paragraph text rather than the raw HTML of the first <p>
                'body': response.css('p::text').getall(),
            }
            # Index the data in Elasticsearch
            self.es.index(index='myindex', document=data)
            # Cache the data in Redis (serialize first; Redis cannot store a dict directly)
            self.redis.set('last_crawled_page', json.dumps(data, default=str))

            # Follow links to other pages
            for href in response.css('a::attr(href)'):
                yield response.follow(href, self.parse)
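
One caveat on the Tor setup: Scrapy's downloader runs on Twisted's asynchronous reactor, so the blocking-socket patch in __init__ may not actually route Scrapy's own requests through Tor. A common alternative, sketched below under the assumption that an HTTP-to-SOCKS bridge such as Privoxy is listening on 127.0.0.1:8118 and forwarding to Tor's SOCKS port 9050, is to set the proxy per request; Scrapy's built-in HttpProxyMiddleware picks it up from request.meta['proxy'].

    # Added inside CombinedSpider, instead of (or alongside) the socket patch:
    def start_requests(self):
        for url in self.start_urls:
            # HttpProxyMiddleware reads the proxy address from request.meta
            yield scrapy.Request(url, callback=self.parse,
                                 meta={'proxy': 'http://127.0.0.1:8118'})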