Commit 019bf30e authored by Debian

cron

parent 53872c19
#!/usr/bin/python3
import sys
import datetime
import glob
import json
import os
#===============================================================================
def main(argv):
    """
    Run the site crawler once for every enabled configuration under a root.

    Every immediate sub-directory of ``argv[1]`` is inspected for a
    ``settings.json`` file. When the file exists, parses as a JSON object,
    has a truthy ``enabled`` entry and names a ``crawler`` whose directory
    exists under the spiders root, ``siteCrawler.py`` is started from that
    spider directory with the sub-directory path as its only argument.
    Progress and problems are appended to ``<root>/<YYYY-MM-DD>.log``.

    Args:
        argv: Command-line vector: ``[program, directory]`` or
            ``[program, directory, endDate]``.

    Exits with status 1 (after printing a usage line) when the number of
    arguments is wrong. A broken configuration or a failing crawler is
    logged and skipped so the remaining sites are still processed.
    """
    # Imported locally so the block does not depend on a file-level import.
    import subprocess

    if len(argv) not in (2, 3):
        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
        sys.exit(1)
    # NOTE(review): the optional endDate argument is accepted but not used
    # anywhere in this function -- confirm whether siteCrawler.py expects it.

    rootdir = argv[1]
    spiders_root = "/home/debian/crawlersNoticias/spiders/"

    # The log file lives inside rootdir, so the directory must exist first.
    os.makedirs(rootdir, exist_ok=True)

    log_filename = os.path.join(
        rootdir, datetime.datetime.today().strftime('%Y-%m-%d') + ".log")
    with open(log_filename, "a", encoding="utf-8") as logfile:
        # Escape rootdir so glob metacharacters in it are taken literally;
        # the trailing '/' in the pattern restricts matches to directories.
        pattern = os.path.join(glob.escape(rootdir), '*/')
        for path in sorted(glob.glob(pattern)):
            print(path)
            settings_file = os.path.join(path, 'settings.json')
            if not os.path.exists(settings_file):
                logfile.write(f"\t configuration file <{path}settings.json> not found.\n")
                continue

            # One unreadable or malformed file must not abort the whole run.
            try:
                with open(settings_file, encoding="utf-8") as json_file:
                    cfg = json.load(json_file)
            except (OSError, ValueError) as e:
                logfile.write(f"\t configuration file <{settings_file}> could not be read: {e}\n")
                continue
            print(cfg)

            if not isinstance(cfg, dict):
                logfile.write(f"\t configuration file <{settings_file}> is not a JSON object.\n")
                continue
            # A missing "enabled" entry is treated the same as a false one.
            if not cfg.get("enabled"):
                continue
            crawler = cfg.get("crawler")
            if not isinstance(crawler, str) or not crawler:
                logfile.write(f"\t configuration file <{settings_file}> has no \"crawler\" entry.\n")
                continue

            logfile.write(f"Crawler {crawler} started at: {datetime.datetime.now().strftime('%Y-%m-%d, %H:%M:%S')}\n")
            spider_dir = os.path.join(spiders_root, crawler)
            if not os.path.isdir(spider_dir):
                logfile.write("\t spider not found.\n")
                continue

            # The child runs from spider_dir, so a relative path would no
            # longer resolve there; pass it absolute, keeping the trailing
            # separator that the glob result carried.
            target = os.path.join(os.path.abspath(path), '')
            # Push buffered log lines out before the child starts.
            logfile.flush()
            try:
                # Argument list (no shell): spaces or metacharacters in the
                # path cannot be misparsed. cwd= replaces chdir/restore.
                result = subprocess.run(
                    ["python3", "../../../scripts/siteCrawler.py", target],
                    cwd=spider_dir,
                    check=False,
                )
            except OSError as e:
                logfile.write(f"\t Error executing siteCrawler.py: {str(e)}\n")
            else:
                if result.returncode != 0:
                    logfile.write(f"\t siteCrawler.py exited with status {result.returncode}.\n")
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    # Executed as a script: hand the full command line to main().
    main(sys.argv)
python3 /home/debian/crawlersNoticias/scripts/crawlAll2.py /data/m3/news
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment