# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Read the "filename" value passed via the crawler settings
        # (e.g. from the command line) and instantiate the pipeline with it.
        settings = crawler.settings
        filename = settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        # Open the output file and start the JSON array.
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        # Close the JSON array and the file.
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Collect only the fields present on the item, keeping a fixed
        # field order in the serialized output.
        row = []
        for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
            if field in item:
                row.append((field, item[field]))
        line = OrderedDict(row)

        # Prefix every record after the first with a comma so the file
        # remains a valid JSON array.
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        else:
            self.file.write(",\n" + json.dumps(line))
        return item
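
# --- Usage sketch (illustrative; project and spider names are assumed) ---
# Assuming this file lives at myproject/pipelines.py, the pipeline is enabled
# in settings.py and the output path is supplied as a setting on the command
# line, which is where crawler.settings.get('filename') picks it up:
#
#     ITEM_PIPELINES = {'myproject.pipelines.JsonWriterPipeline': 300}
#
#     scrapy crawl myspider -s filename=items.json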