animalPolitico

parent 1a13f0d7
......@@ -4,34 +4,38 @@ import datetime
import glob
import json
import os
#===============================================================================
#===============================================================================
def main(argv):
    '''
    Walk every immediate subdirectory of a root directory and launch the
    configured site crawler for each one, appending progress to a dated log.

    Parameters:
        argv: command-line argument list (sys.argv style).
              argv[1] is the root directory containing one subdirectory per
              crawler, each with a settings.json; argv[2] is an optional
              end date (YYYY-MM-DD) — accepted but not consumed here
              (presumably read by siteCrawler.py; verify against caller).

    Side effects: creates the root directory if missing, appends to
    <rootdir>/<today>.log, changes the working directory while each spider
    runs, and shells out to scripts/siteCrawler.py via os.system.
    Returns None.
    '''
    # Remember the starting directory so we can restore it at the end,
    # since each spider run chdir()s into its own folder.
    cwd = os.getcwd()
    if len(argv) != 2 and len(argv) != 3:
        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
    else:
        rootdir = argv[1]
        # Ensure the log directory exists before opening the log file.
        if not os.path.exists(rootdir):
            os.makedirs(rootdir)
        log_filename = rootdir + "/" + datetime.datetime.today().strftime('%Y-%m-%d') + ".log"
        with open(log_filename, "a") as logfile:
            # One subdirectory per crawler; sorted for deterministic order.
            for path in sorted(glob.glob(f'{rootdir}/*/')):
                print(path)
                if not os.path.exists(path + 'settings.json'):
                    logfile.write("\t configuration file <" + path + "settings.json> not found.\n")
                    continue
                with open(path + 'settings.json') as json_file:
                    cfg = json.load(json_file)
                print(cfg)
                # Skip crawlers explicitly disabled in their settings.
                if not cfg["enabled"]:
                    continue
                logfile.write("Crawler " + cfg["crawler"] + " started at: " +
                              datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S") + "\n")
                new_cwd = os.getcwd() + "/spiders/" + cfg["crawler"]
                if os.path.exists(new_cwd):
                    # Run the spider from inside its own directory; the
                    # relative path back to the script assumes this layout.
                    os.chdir(new_cwd)
                    os.system("python3 ../../../scripts/siteCrawler.py " + path)
                else:
                    logfile.write("\t spider not found.\n")
                print(os.getcwd())
    # Restore the original working directory regardless of what ran.
    os.chdir(cwd)
#-------------------------------------------------------------------------------
# Script entry point: forward the raw command-line arguments to main().
# NOTE(review): relies on `import sys` at module top — not visible in this
# chunk of the file; confirm it is present above the diff hunk.
if __name__ == "__main__":
    main(sys.argv)
......@@ -84,12 +84,10 @@ class QuotesSpider(scrapy.Spider):
self.uri_base = "http://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
self.uri_page = "%22}&type=page&page="
self.uri_complement = "&size=10"
print(self.uri_base+self.uri_page+self.uri_complement)
for s in sectionList:
yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
def parse(self, response):
print(response.url)
searchData = ImportantData()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment