Scrapy is a fast, open-source web crawling and data extraction framework written in Python, widely used for web scraping and mining structured data from websites. Unlike simple scraping libraries, Scrapy is a comprehensive framework that manages every aspect of the scraping process—from sending HTTP requests and handling responses to processing, cleaning, and storing the extracted data in formats such as JSON, CSV, or XML. In the example below, we have a Python Scrapy script that crawls both a regular website and a dark-web onion site.
# Create a scrapy project first
# scrapy startproject myproject
# cd myproject
# scrapy crawl combinedspider
import json
import socket

import redis
import scrapy
import socks
from elasticsearch import Elasticsearch
from onionscan import Onion
class CombinedSpider(scrapy.Spider):
    """Spider that crawls both clearnet and Tor onion URLs.

    Onion URLs are scanned with OnionScan and the report is indexed;
    clearnet pages are scraped for a title and first paragraph. Every
    result is indexed in Elasticsearch, and the most recent result of
    each kind is cached in Redis. All traffic is routed through a local
    Tor SOCKS5 proxy (127.0.0.1:9050).
    """

    name = 'combinedspider'
    start_urls = ['https://www.example.com/', 'http://exampleonion.onion/']

    def __init__(self, *args, **kwargs):
        # Initialize the base Spider first, before attaching our own state.
        super().__init__(*args, **kwargs)
        # Route socket traffic through the local Tor SOCKS5 proxy.
        # NOTE(review): this monkey-patches socket.socket process-wide, not
        # just for this spider — confirm that is intended.
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
        socket.socket = socks.socksocket
        # Elasticsearch client used to index scraped documents.
        self.es = Elasticsearch(['http://localhost:9200'])
        # Redis client used as a "last result" cache.
        self.redis = redis.Redis(host='localhost', port=6379, db=0)
        # OnionScan wrapper for scanning hidden services.
        self.onion = Onion()

    def parse(self, response):
        """Handle a crawled page: scan onions, scrape clearnet, follow links.

        Yields follow-up Requests for every link found on the page.
        """
        if response.url.startswith('http://exampleonion.onion/'):
            # Scan the onion service and extract structured fields
            # from the OnionScan report.
            report = self.onion.scan(response.url)
            data = {
                'summary': report.summary(),
                'services': report.services(),
                'links': report.links(),
            }
            self.es.index(index='onionindex', doc_type='oniontype', body=data)
            # BUG FIX: redis-py cannot store a dict directly (raises
            # DataError) — serialize to JSON before caching.
            self.redis.set('last_scanned_onion', json.dumps(data))
        else:
            data = {
                'title': response.css('h1::text').get(),
                # NOTE(review): 'p' without ::text returns the raw HTML of
                # the first <p> element, not its text — confirm intended.
                'body': response.css('p').get(),
            }
            self.es.index(index='myindex', doc_type='mytype', body=data)
            # BUG FIX: serialize the dict before caching (see above).
            self.redis.set('last_crawled_page', json.dumps(data))
        # Follow every link on the page, parsing it with this same callback.
        for href in response.css('a::attr(href)'):
            yield response.follow(href, self.parse)