#!/usr/bin/python3
"""Run the configured news-site crawlers found under a root directory.

Usage: <script> <directory> [endDate:YYYY-MM-DD]

Each immediate subdirectory of <directory> is expected to contain a
``settings.json`` with at least an ``enabled`` flag and a ``crawler`` name.
For every enabled crawler whose spider directory exists, ``siteCrawler.py``
is executed with the subdirectory path as its argument.  Progress and
errors are appended to ``<directory>/<YYYY-MM-DD>.log``.
"""

import datetime
import glob
import json
import os
import subprocess
import sys

# Root of the spider implementations; one subdirectory per crawler name.
SPIDERS_ROOT = "/home/debian/crawlersNoticias/spiders/"


def main(argv):
    """Process every crawler subdirectory under ``argv[1]``.

    argv[1] -- root directory containing one subdirectory per site.
    argv[2] -- optional end date (YYYY-MM-DD); accepted but currently
               unused.  TODO: forward it to siteCrawler.py or drop it.

    Exits with status 1 on bad usage.
    """
    if len(argv) not in (2, 3):
        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
        sys.exit(1)

    rootdir = argv[1]
    # Ensure the log directory exists before opening the log file.
    if not os.path.exists(rootdir):
        os.makedirs(rootdir)

    log_filename = os.path.join(
        rootdir, datetime.datetime.today().strftime('%Y-%m-%d') + ".log")
    with open(log_filename, "a") as logfile:
        # One site per subdirectory, processed in deterministic order.
        for path in sorted(glob.glob(os.path.join(rootdir, '*/'))):
            print(path)
            settings_path = os.path.join(path, 'settings.json')
            if not os.path.exists(settings_path):
                logfile.write(f"\t configuration file <{path}settings.json> not found.\n")
                continue

            try:
                with open(settings_path) as json_file:
                    cfg = json.load(json_file)
            except (json.JSONDecodeError, OSError) as e:
                # A corrupt config must not abort the remaining sites.
                logfile.write(f"\t invalid configuration file <{settings_path}>: {e}\n")
                continue
            print(cfg)

            # A missing "enabled" key is treated as disabled (the original
            # raised KeyError here and killed the whole run).
            if not cfg.get("enabled"):
                continue

            logfile.write(
                f"Crawler {cfg['crawler']} started at: "
                f"{datetime.datetime.now().strftime('%Y-%m-%d, %H:%M:%S')}\n")

            spider_dir = os.path.join(SPIDERS_ROOT, cfg["crawler"])
            if not os.path.exists(spider_dir):
                logfile.write("\t spider not found.\n")
                continue

            # Run the crawler with cwd=spider_dir instead of
            # os.chdir + os.system: an argument list avoids shell injection
            # through `path`, this process's working directory is never
            # mutated, and the child's exit status is actually checked
            # (os.system never raises, so the old try/except was dead code).
            try:
                result = subprocess.run(
                    ["python3", "../../../scripts/siteCrawler.py", path],
                    cwd=spider_dir)
                if result.returncode != 0:
                    logfile.write(
                        f"\t siteCrawler.py exited with code {result.returncode}\n")
            except OSError as e:
                # e.g. python3 missing from PATH — log and continue with
                # the next crawler.
                logfile.write(f"\t Error executing siteCrawler.py: {str(e)}\n")


if __name__ == "__main__":
    main(sys.argv)