I have the following data extract problem:
The spider extracts only first item from every page. There are over a dozen downloadable properties on each site.
When I delete item_loader.default_output_processor = TakeFirst ()
the spider extracts all the correct data but it puts all the data together…
Can anyone help how to solve it?
Here is the code:
spider.py
[...]
class RealEstateSpider(scrapy.Spider):
name = 'realestates'
start_urls = [
'www.webtoscrape.com'
]
def parse(self, response):
item_loader = ItemLoader(item=REItem(), response=response)
item_loader.default_input_processor = MapCompose(remove_tags)
item_loader.default_output_processor = TakeFirst()
item_loader.add_css("price", ".offer-item-price")
item_loader.add_css("size", ".offer-item-area")
item_loader.add_css("rooms", ".offer-item-rooms")
yield item_loader.load_item()
next_page = response.css('.pager-next a::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
items.py
[...]
class REItem(scrapy.Item):
price = scrapy.Field(
input_processor=MapCompose(remove_tags, lambda value: float(value.replace("
", "").replace(" ", "").replace(",", ".")))
)
size = scrapy.Field(
input_processor=MapCompose(remove_tags, lambda value: float(value.replace("
", "").replace(" ", "").replace(",", ".")))
)
rooms = scrapy.Field(
input_processor=MapCompose(remove_tags, lambda value: int(value.replace(">", "").replace("
", "").replace(" ", "")))
)
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…