#!/usr/bin/python3
# File: siteCrawler.py
# Author: Mario Chirinos Colunga
# Daily News Site Crawler
#===============================================================================
import sys
import os
from pathlib import Path
import datetime
import json
#===============================================================================


def findLastDate(directory):
    """Return the date of the most recent crawled JSON file under *directory*.

    *directory* is expected to contain one sub-directory per year, each
    holding files named ``YYYY-MM-DD.json``.  Year directories are scanned
    newest-first; the first one that contains at least one JSON file yields
    the latest date found in it.

    Parameters
    ----------
    directory : str
        Absolute path (with trailing slash) of the crawl output directory.

    Returns
    -------
    datetime.datetime or None
        Date parsed from the newest JSON filename, or ``None`` when no
        JSON file exists anywhere under *directory*.
    """
    print(directory)
    path = Path(directory)
    dirs = [e.name for e in path.iterdir() if e.is_dir()]
    print("len:", len(dirs))
    # Newest year first, so the first non-empty directory wins.
    dirs = sorted(dirs, reverse=True)
    # BUGFIX: original read ``while i=0:`` (a SyntaxError); the intent —
    # visible in the commented-out ``i=-1 / i-=1`` history — is to walk the
    # sorted year dirs until one contains a JSON file.
    for d in dirs:
        yearpath = Path(directory + d)
        # Strip the ".json" suffix to leave the YYYY-MM-DD stem.
        files = [e.name[:-5] for e in yearpath.glob("*.json")]
        if len(files) > 0:
            files.sort()
            return datetime.datetime.strptime(files[-1], '%Y-%m-%d')
    return None


#===============================================================================
def crawlNews(directory, cfg, endDate=None):
    """Crawl daily news from the day after the last stored file up to *endDate*.

    For each missing day, invokes ``scrapy crawl noticias`` and stores the
    result as ``<directory>/<year>/<YYYY-MM-DD>.json``.

    Parameters
    ----------
    directory : str
        Absolute path (with trailing slash) where JSON files are stored;
        must contain a ``settings.json`` with at least a ``startDate`` key
        (used only when no previous crawl output exists).
    cfg : dict
        Parsed ``settings.json`` contents.
    endDate : datetime.datetime, optional
        Last day to crawl (inclusive).  Defaults to "now".
        BUGFIX: the original used ``endDate=datetime.datetime.now()`` as the
        default, which is evaluated once at import time — a long-running
        process would never crawl past module-load time.  ``None`` sentinel
        defers the call to invocation time.
    """
    if endDate is None:
        endDate = datetime.datetime.now()
    startDate = findLastDate(directory)
    if startDate is None:
        startDate = datetime.datetime.strptime(cfg["startDate"], '%Y-%m-%d')
    print("start: ", startDate, "end: ", endDate)
    delta = endDate - startDate
    print("Days to crawl:", delta)
    for i in range(delta.days + 1):
        day = startDate + datetime.timedelta(days=i)
        yeardir = directory + str(day.year) + "/"
        print(yeardir)
        if not os.path.exists(yeardir):
            os.mkdir(yeardir)
        print(day)
        print(os.getcwd())
        # Build the command once instead of duplicating the concatenation
        # for print and execution (the two copies could silently diverge).
        # SECURITY NOTE: os.system interpolates *directory* into a shell
        # string — only safe while the path comes from a trusted operator;
        # consider subprocess.run([...], shell=False) if that changes.
        cmd = ("scrapy crawl noticias --nolog -O " + yeardir
               + day.strftime('%Y-%m-%d') + ".json -a year=" + str(day.year)
               + " -a month=" + str(day.month) + " -a day=" + str(day.day))
        print(cmd)
        os.system(cmd)


#===============================================================================
def main(argv):
    """Entry point: ``siteCrawler.py <directory> [endDate:YYYY-MM-DD]``.

    Loads ``<directory>/settings.json`` and crawls from the last stored day
    (or ``cfg["startDate"]``) up to *endDate* (default: today).
    """
    if len(argv) != 2 and len(argv) != 3:
        # BUGFIX: original usage string omitted the mandatory directory arg.
        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
    else:
        with open(argv[1] + 'settings.json') as json_file:
            cfg = json.load(json_file)
        print(cfg)
        if len(argv) == 2:
            crawlNews(argv[1], cfg)
        if len(argv) == 3:
            crawlNews(argv[1], cfg,
                      datetime.datetime.strptime(argv[2], '%Y-%m-%d'))


if __name__ == "__main__":
    main(sys.argv)