#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-06-11 10:10:53
# Project: london

from pyspider.libs.base_handler import *
import pymongo


class Handler(BaseHandler):
    """pyspider handler that scrapes London attraction pages from TripAdvisor.

    Starting from a single listing URL, it queues every attraction link plus
    the "next" pagination link, extracts a handful of fields from each detail
    page, and writes every non-empty result to the ``london`` collection of
    the ``trip`` MongoDB database.
    """

    crawl_config = {
    }
    # Client and database handles are class attributes, so they are created
    # once when the class body is evaluated.
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the first listing page; ``index_page`` handles the response."""
        self.crawl(
            'https://www.tripadvisor.cn/Attractions-g186338-Activities-c47-London_England.html',
            callback=self.index_page,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every attraction on a listing page, then the next listing page.

        :param response: pyspider response for one listing page.
        """
        for each in response.doc('.listing_title > a').items():
            self.crawl(each.attr.href, callback=self.detail_page)

        # The selector matches nothing on the last listing page, in which case
        # ``href`` is None; only follow the link when there is one.
        next_page = response.doc('.pagination .nav.next').attr.href
        if next_page:
            self.crawl(next_page, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the attraction fields from a detail page.

        :param response: pyspider response for one attraction page.
        :return: dict with the keys ``name``, ``url``, ``comment``,
            ``address``, ``phone``, ``duration`` and ``instruction``. Every
            value except ``url`` is the text of the matched element(s).
        """
        doc = response.doc
        # NOTE(review): the two selectors below depend on exact child
        # positions inside #MAP_AND_LISTING and will silently return empty
        # text if the page layout differs - confirm against the live markup.
        return {
            "name": doc('h1').text(),
            "url": response.url,
            'comment': doc('.heading_ratings .taLnk').text(),
            'address': doc('.addressReset > span.format_address').text(),
            'phone': doc('.phoneNumber').text(),
            'duration': doc('#MAP_AND_LISTING > div.main_section.listingbar > div > div.above_fold_listing_details > div > div:nth-child(5) > div > div:nth-child(1)').text(),
            'instruction': doc('#MAP_AND_LISTING > div.main_section.listingbar > div > div.above_fold_listing_details > div > div:nth-child(6) > div > b').text()
        }

    def on_result(self, result):
        """Persist a non-empty callback result to MongoDB.

        :param result: value returned by a callback; falsy values are ignored.
        """
        # NOTE(review): this override does not call the base class's
        # on_result - confirm that skipping it is intended.
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert ``result`` into the ``london`` collection and log success.

        :param result: dict to store.
        """
        # ``Collection.insert`` was deprecated in PyMongo 3.0 and removed in
        # 4.0; ``insert_one`` is the supported single-document equivalent.
        outcome = self.db['london'].insert_one(result)
        if outcome.inserted_id:
            print('saved to mongo', result)