#!/usr/bin/python3
# File: siteCrawler.py
# Author: Mario Chirinos Colunga
# Daily News Site Crawler
#===============================================================================
import sys
import os
from pathlib import Path
import datetime
import json
#===============================================================================
def findLastDate(directory):
	"""
	Return the most recent crawled date found under *directory*.

	Scans the per-year subdirectories (newest year first) for files named
	YYYY-MM-DD.json and returns the latest date as a datetime.

	directory: path to the storage root containing one subdirectory per year.
	Returns: datetime of the newest YYYY-MM-DD.json file, or None when no
	         JSON file exists anywhere under *directory*.
	"""
	print(directory)
	root = Path(directory)
	# Year directory names, newest first, so the first directory that
	# contains any JSON file necessarily holds the latest date.
	dirs = sorted((e.name for e in root.iterdir() if e.is_dir()), reverse=True)
	print("len:", len(dirs))
	for d in dirs:
		# Path joining instead of string concatenation: works whether or
		# not *directory* ends with a slash. .stem drops the ".json" suffix.
		files = [e.stem for e in (root / d).glob("*.json")]
		if files:
			# ISO date strings sort lexicographically == chronologically,
			# so max() picks the latest day in this year directory.
			return datetime.datetime.strptime(max(files), '%Y-%m-%d')
	return None

#===============================================================================

def crawlNews(directory, cfg, endDate=None):
	"""
	Crawl news day by day, resuming after the last stored date.

	directory: absolute path (with trailing slash) where the per-year JSON
	           files are stored as YYYY/YYYY-MM-DD.json.
	cfg: configuration dict; cfg["startDate"] ('%Y-%m-%d') is the first day
	     to crawl when *directory* holds no previous results.
	endDate: datetime of the last day to crawl. Defaults to "now" computed
	         at call time — a literal datetime.now() default would be
	         frozen once at import time and go stale in long-running use.
	"""
	if endDate is None:
		endDate = datetime.datetime.now()
	startDate = findLastDate(directory)
	if startDate is None:
		startDate = datetime.datetime.strptime(cfg["startDate"], '%Y-%m-%d')
	print("start: ", startDate, "end: ", endDate)
	delta = endDate - startDate
	print("Days to crawl:", delta)
	for i in range(delta.days + 1):
		day = startDate + datetime.timedelta(days=i)
		yeardir = directory + str(day.year) + "/"
		print(yeardir)
		# makedirs(exist_ok=True) avoids the check-then-create race of
		# os.path.exists + os.mkdir.
		os.makedirs(yeardir, exist_ok=True)
		print(day)
		print(os.getcwd())
		# Build the scrapy command once so the printed command and the
		# executed command can never drift apart.
		cmd = ("scrapy crawl noticias --nolog -O " + yeardir
		       + day.strftime('%Y-%m-%d') + ".json -a year=" + str(day.year)
		       + " -a month=" + str(day.month) + " -a day=" + str(day.day))
		print(cmd)
		os.system(cmd)
#===============================================================================
def main(argv):
	"""
	Entry point: crawl a news site into the directory given on the CLI.

	argv: [script, directory, optional endDate as 'YYYY-MM-DD'].
	The directory must contain a settings.json file with at least a
	"startDate" key ('%Y-%m-%d').
	"""
	if len(argv) not in (2, 3):
		# Original message printed "script<directory>" with no separator.
		print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
		return
	# os.path.join stays correct whether or not the directory argument
	# ends with a slash (plain concatenation did not).
	with open(os.path.join(argv[1], 'settings.json')) as json_file:
		cfg = json.load(json_file)
		print(cfg)
	if len(argv) == 2:
		crawlNews(argv[1], cfg)
	else:
		crawlNews(argv[1], cfg, datetime.datetime.strptime(argv[2], '%Y-%m-%d'))

if __name__ == "__main__":
	main(sys.argv)