Notes on integrating Django with Scrapy.
Requirements
Django 3.1.4
Scrapy 2.4.1
MySQL 5.6+
Create the project

```bash
pipx install Django==3.1.4 -i https://mirrors.aliyun.com/pypi/simple/
django-admin startproject $myproject
cd $myproject
pipenv --python 3.6.9
vim Pipfile
pipenv install django==3.1.4
pipenv shell
python manage.py startapp backend
exit
pipenv install scrapy==2.4.1 -i https://mirrors.aliyun.com/pypi/simple/
scrapy startproject crawlend
cd crawlend
scrapy genspider douban douban.com
```
To make deployment easier, move scrapy.cfg and its sibling crawlend directory up to the project root (the outermost crawlend directory is no longer needed).
See this article for the resulting directory structure.
Prepare the database
docker-compose.yml
```yaml
version: '3.3'
services:
  mysql:
    image: mysql:5.6
    volumes:
      - ./mysql:/var/lib/mysql
    restart: always
    ports:
      - '3306:3306'
    environment:
      MYSQL_ROOT_PASSWORD: some_django_app
    command: ['--character-set-server=utf8mb4',
              '--collation-server=utf8mb4_unicode_ci',
              '--default-authentication-plugin=mysql_native_password']
    container_name: djapp-mysql

  redis:
    image: redis:latest
    restart: always
    ports:
      - '6379:6379'
    container_name: djapp-redis

  phpadmin:
    links:
      - "mysql:db"
    image: phpmyadmin/phpmyadmin:latest
    restart: 'no'
    ports:
      - '8080:80'
    container_name: phpadmin
```
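The post does not show the start-up command; with this file in place, the stack comes up with the standard Compose commands:

```bash
docker-compose up -d   # start mysql, redis and phpmyadmin in the background
docker-compose ps      # verify the three containers are running
```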
Modify the default code
```bash
pipenv install django-environ pymysql
```
In the project's settings.py. The post omits the django-environ initialization that the env() calls below rely on; the first few lines add it:

```python
import environ

# read MYSQL_* variables from the environment / a .env file
env = environ.Env()
environ.Env.read_env()

INSTALLED_APPS = [
    'backend',
    ...
]

ALLOWED_HOSTS = ['*']

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': env('MYSQL_DBNAME'),
        'USER': env('MYSQL_USER'),
        'PASSWORD': env('MYSQL_PASSWORD'),
        'HOST': env('MYSQL_HOST'),
        'PORT': env('MYSQL_PORT'),
        'OPTIONS': {
            'autocommit': True,
        },
    }
}

LANGUAGE_CODE = 'zh-hans'

TIME_ZONE = 'Asia/Shanghai'

USE_TZ = False
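The env() calls expect the MYSQL_* variables to be defined somewhere. The post does not show them; one option is a .env file next to settings.py (the default location read_env() looks in), with values matching the docker-compose file above (the database name is an assumption):

```
MYSQL_DBNAME=django_app
MYSQL_USER=root
MYSQL_PASSWORD=some_django_app
MYSQL_HOST=127.0.0.1
MYSQL_PORT=3306
```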
The project's urls.py:

```python
...
from django.urls import path, include

urlpatterns = [
    path('admin/', admin.site.urls),
    path('main/', include(('backend.urls', 'backend'), namespace='backend')),
]
```
backend/urls.py:

```python
from django.conf.urls import url

from . import views

urlpatterns = [
    url(r'^$', views.index, name='index'),
    url(r'api/', views.api, name='api'),
]
```
backend/views.py:

```python
from django.shortcuts import render
from django.http import HttpResponse


def index(request):
    return HttpResponse('hello world')


def api(request):
    return HttpResponse('hello api')
```
Test the service

```bash
pipenv shell
python manage.py migrate
python manage.py createsuperuser
python manage.py runserver
```
If Django complains that the database django_app does not exist, create the database first via phpMyAdmin.
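Any MySQL client will do; for example, through the djapp-mysql container started above (the database name django_app and the utf8mb4 settings are assumptions consistent with the rest of the post):

```bash
docker exec -it djapp-mysql \
  mysql -uroot -psome_django_app \
  -e "CREATE DATABASE django_app CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;"
```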
Modify the models

Define models to match the content you want to store, e.g. the Douban movie chart.
backend/models.py:

```python
from django.db import models


class Movie(models.Model):
    """Movie table."""

    uid = models.CharField('唯一ID', max_length=20, unique=True, db_index=True)
    title = models.CharField('标题', max_length=200)
    release = models.CharField('上映时间', max_length=20)
    url = models.CharField('链接', max_length=700)

    def __str__(self):
        return "%s" % (self.title)
```
Register PyMySQL as a MySQLdb drop-in (the version_info spoof satisfies Django's minimum mysqlclient version check) and fail fast if the database is unreachable. The post does not say which file this snippet lives in; it just needs to run before Django talks to MySQL:

```python
import pymysql, sys, logging

# make Django's MySQL backend use PyMySQL and pass its version check
pymysql.version_info = (1, 4, 2, "final", 0)
pymysql.install_as_MySQLdb()

from django.db import connections
from django.db.utils import OperationalError

# abort early if the database cannot be reached
db_conn = connections['default']
try:
    c = db_conn.cursor()
except OperationalError:
    logging.error('db connect error!')
    sys.exit(1)
else:
    c.close()
```
After changing the models, run the migrations:
```bash
python manage.py makemigrations
python manage.py migrate
```
Modify crawlend
Add crawlend/crawlend/state.py to hold shared crawl state.
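The post does not list state.py itself; judging from its use in pipelines.py below (state.duplicate_count += 1), a minimal version could be just a module-level counter:

```python
# crawlend/crawlend/state.py (sketch; only duplicate_count is referenced elsewhere)
duplicate_count = 0
```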
Add crawlend/crawlend/helper.py:
```python
import os
import sys
import django
import logging
import datetime


def beijing(sec, what):
    """Return Beijing time, used as the converter for log timestamps."""
    beijing_time = datetime.datetime.now() + datetime.timedelta(hours=8)
    return beijing_time.timetuple()


class CrawerHelper(object):

    def init_app(self):
        self.load_django_app()
        logging.Formatter.converter = beijing

    def load_django_app(self):
        # put the Django project root on sys.path and bootstrap Django
        BASE_DIR = os.path.dirname(os.path.dirname(
            os.path.dirname(os.path.abspath(__file__))))
        sys.path.append(BASE_DIR)
        os.environ.setdefault("DJANGO_SETTINGS_MODULE", "djcrawler.settings")
        django.setup()

    def get_current_page_from_db(self, uid):
        from backend.models import CrawlState
        try:
            crawlstate = CrawlState.objects.get(uid=uid)
            print(112, crawlstate.current_page)
            return crawlstate.current_page
        except CrawlState.DoesNotExist:
            return ''

    def save_current_page_to_db(self, uid, name, page):
        from backend.models import CrawlState
        crawlstate, created = CrawlState.objects.get_or_create(uid=uid)
        if created:
            crawlstate.crawler_name = name
            crawlstate.current_page = page
            crawlstate.save()
        else:
            crawlstate.current_page = page
            crawlstate.save()
```
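helper.py (and the multi-page logic later on) reads and writes a CrawlState model that the post never shows. A minimal definition consistent with that usage (an assumption; it would go in backend/models.py and be migrated like Movie) could be:

```python
class CrawlState(models.Model):
    """Crawl progress per spider (hypothetical; inferred from helper.py)."""

    uid = models.CharField(max_length=50, unique=True, db_index=True)
    crawler_name = models.CharField(max_length=50, blank=True, default='')
    current_page = models.CharField(max_length=20, blank=True, default='')

    def __str__(self):
        return "%s:%s" % (self.crawler_name, self.current_page)
```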
Modify crawlend/crawlend/settings.py: bootstrap the Django app when the Scrapy settings are loaded, and register the item pipelines.
```python
from .helper import CrawerHelper

# load the Django project so the ORM is usable from spiders and pipelines
CrawerHelper().init_app()

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 5
RANDOMIZE_DOWNLOAD_DELAY = True

ITEM_PIPELINES = {
    'crawlend.pipelines.CrawlendPipeline': 200,
    'crawlend.pipelines.InsertDBPipeline': 300,
}
```
Modify crawlend/crawlend/items.py:
```python
import scrapy


class CrawlendItem(scrapy.Item):
    movie_uid = scrapy.Field()
    movie_title = scrapy.Field()
    movie_release = scrapy.Field()
    movie_url = scrapy.Field()
```
Modify crawlend/crawlend/pipelines.py:
```python
from itemadapter import ItemAdapter
import sys, django, logging, traceback

from crawlend import state
from backend.models import *


class CrawlendPipeline:
    def process_item(self, item, spider):
        return item


class InsertDBPipeline(object):
    def process_item(self, item, spider):
        try:
            movie = Movie(uid=item['movie_uid'],
                          title=item['movie_title'],
                          release=item['movie_release'],
                          url=item['movie_url'])
            logging.info('starting save... ' + item['movie_title'])
            movie.save()
        except django.db.utils.IntegrityError:
            # the unique uid already exists; count it and move on
            logging.warning('Exist repeat item! ' + item['movie_uid'])
            state.duplicate_count += 1
        except Exception:
            traceback.print_exc()
            sys.exit(1)
        return item
```
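The post never shows how state.duplicate_count is consumed. One option (my assumption, not from the post) is to log it when the spider finishes, via the standard close_spider hook on InsertDBPipeline:

```python
    # added to InsertDBPipeline
    def close_spider(self, spider):
        logging.info('duplicate items skipped: %s', state.duplicate_count)
```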
Modify crawlend/crawlend/spiders/$site.py (douban.py in this example).
Comment out allowed_domains.
```python
import logging

import scrapy

from crawlend.items import CrawlendItem


class DoubanSpider(scrapy.Spider):
    name = 'douban'
    url = 'https://movie.douban.com/chart'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        'Origin': 'https://movie.douban.com',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com'
    }

    def start_requests(self):
        start_url = self.url
        logging.warning('ready to get: ' + start_url)
        yield scrapy.Request(url=start_url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        res_list = response.css('.indent table')
        for record in res_list:
            item = CrawlendItem()
            href = record.css('.pl2 a::attr(href)').get(default='').strip()
            href_splited = list(filter(None, href.split('/')))
            title = record.css('.pl2 a::text').get(default='').strip()
            if title.endswith('/'):
                title = title[:-1].strip()
            item['movie_uid'] = href_splited[-1]
            item['movie_title'] = title
            item['movie_release'] = 'test'  # placeholder; the release date is not parsed here
            item['movie_url'] = href
            yield item
```
The logic for crawling multiple pages:
```python
    def parse(self, response):
        if self.current_page < self.last_page:
            self.current_page = self.current_page + 1
            next_page = self.url + str(self.current_page)
            logging.warning('ready to fetch: ' + next_page)
            yield scrapy.Request(url=next_page,
                                 headers=self.headers,
                                 callback=self.parse)
```
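For this to work, current_page and last_page must already exist on the spider when parse runs; the post does not show where they come from. One way (a sketch under my own assumptions, using the spider name as the CrawlState uid) is to restore the page in start_requests via the CrawerHelper methods shown earlier:

```python
from crawlend.helper import CrawerHelper


class DoubanSpider(scrapy.Spider):
    # ... name, url, headers as above ...
    last_page = 10  # assumed upper bound for the crawl

    def start_requests(self):
        helper = CrawerHelper()
        saved = helper.get_current_page_from_db(self.name)  # '' when no record exists yet
        self.current_page = int(saved) if saved else 1
        yield scrapy.Request(url=self.url, headers=self.headers, callback=self.parse)
```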
Test-run Scrapy

```bash
cd crawlend
scrapy list
scrapy crawl douban
```
Modify the admin
backend/admin.py:

```python
from django.contrib import admin

from .models import Movie


class MovieAdmin(admin.ModelAdmin):
    # columns shown in the change list
    list_display = ('uid', 'title', 'release', 'url')
    # sidebar filter options
    list_filter = ('release',)
    # entries per page
    list_per_page = 5
    # fields editable directly from the list view
    list_editable = ('title',)


admin.site.register(Movie, MovieAdmin)
```
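Not in the original post, but if a search box over titles is wanted in the list view, ModelAdmin supports it with one more line on MovieAdmin:

```python
    search_fields = ('title',)
```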