beijing.py
beijing.py
# -*- coding: utf-8 -*-
import scrapy
from zufang.items import ZufangItem
class BeijingSpider(scrapy.Spider):
    """Crawl the 58.com Beijing rental listing pages.

    Starts at ``http://bj.58.com/chuzu/``, yields one ``ZufangItem`` per
    ``.des`` listing node and follows the ``.next`` pagination link until
    no such link is present.
    """

    name = "beijing"
    allowed_domains = ["58.com"]
    start_urls = ['http://bj.58.com/chuzu/']

    def parse(self, response):
        """Extract every listing on the page and schedule the next page.

        Args:
            response: the Scrapy response for a listing page.

        Yields:
            A ``ZufangItem`` for each ``.des`` node, followed by a
            ``scrapy.Request`` for the next page when a ``.next`` link exists.
        """
        for listing in response.css('.des'):
            # Build a fresh item per listing: sharing one mutable item across
            # yields lets fields of an earlier listing leak into later ones.
            item = ZufangItem()

            # Keep the title as text; encoding belongs at the output boundary.
            item['title'] = ''.join(listing.css('h2>a::text').extract()).strip()
            item['weburl'] = listing.css('h2>a::attr(href)').extract_first()
            item['addres'] = listing.css('.add>a:first-child::text').extract_first()

            # The '.room' text is split on whitespace: first token is the room
            # type, second the size. Both stay empty unless both are present.
            room_parts = (listing.css('.room::text').extract_first() or '').split()
            if len(room_parts) >= 2:
                item['roomtype'] = room_parts[0]
                item['size'] = room_parts[1]
            else:
                item['roomtype'] = ''
                item['size'] = ''

            agent = listing.css('.jjr')
            if agent:
                # The shop name node may be absent; fall back to '' instead of
                # calling .strip() on None.
                shop_name = agent.css('span>span::text').extract_first() or ''
                item['contacts'] = agent.css('.listjjr>a::text').extract_first()
                item['company'] = shop_name.strip()
            else:
                # Always set both fields so every item has the same keys.
                item['contacts'] = ''
                item['company'] = ''

            item['price'] = listing.xpath(
                'following-sibling::div[@class="listliright"]'
                '/div[@class="money"]/b/text()'
            ).extract_first()
            yield item

        next_page = response.css('.next::attr(href)').extract_first()
        if next_page:
            # urljoin resolves relative hrefs and leaves absolute URLs as-is.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
items.py
items.py
import scrapy
class ZufangItem(scrapy.Item):
    """Container for one rental listing scraped by ``BeijingSpider.parse``."""
    # Listing title: joined text of the listing's 'h2>a' link.
    title = scrapy.Field()
    # Text of the first link under '.add'. The key is spelled 'addres'
    # because the spider writes to exactly this name.
    addres = scrapy.Field()
    # Second whitespace-separated token of the '.room' text ('' if missing).
    size = scrapy.Field()
    # First whitespace-separated token of the '.room' text ('' if missing).
    roomtype = scrapy.Field()
    # Text of the '.listjjr>a' link inside the '.jjr' node ('' when no '.jjr').
    contacts = scrapy.Field()
    # Text of the '<b>' inside the sibling 'money' div.
    price = scrapy.Field()
    # Text of 'span>span' inside the '.jjr' node.
    company = scrapy.Field()
    # href of the listing's 'h2>a' link.
    weburl = scrapy.Field()
pipelines.py
pipelines.py
import codecs
import json
class ZufangPipeline(object):
    """Append every scraped item to ``zufang.json`` as one JSON object per line.

    Non-ASCII characters are written literally (UTF-8) rather than as
    ``\\uXXXX`` escapes, and each line remains valid JSON.
    """

    def __init__(self):
        # codecs.open encodes the text we write to UTF-8 on the way out.
        self.file = codecs.open('zufang.json', 'wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` to one JSON line and write it to the output file.

        Args:
            item: the scraped item (anything ``dict()`` accepts).
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipeline stages can keep processing it.
        """
        record = {key: self._to_text(value) for key, value in dict(item).items()}
        # ensure_ascii=False keeps non-ASCII text readable. Decoding the dumped
        # line with 'unicode_escape' instead would also un-escape JSON's own
        # \" and \\ sequences and corrupt the output.
        line = json.dumps(record, ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

    @staticmethod
    def _to_text(value):
        """Decode UTF-8 byte strings to text; return other values unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value
        Recommended Posts