animalPolitico

parent 1a13f0d7
@@ -4,41 +4,46 @@ import datetime
 import glob
 import json
 import os
+#===============================================================================
 #===============================================================================
 def main(argv):
     '''
-    '''
-    cwd = os.getcwd()
-    if len(argv) != 2 and len(argv) != 3:
-        print ("Usage: " + argv[0] + "<directory> [endDate:YYYY-MM-DD]")
-    else:
-        rootdir = argv[1]
-        with open(rootdir+"/"+datetime.datetime.today().strftime('%Y-%m-%d')+".log", "a") as logfile:
-            for path in sorted(glob.glob(f'{rootdir}/*/')):
-                print(path)
-                if not os.path.exists(path+'settings.json'):
-                    logfile.write("\t configuration file <"+path+"settings.json> not found.\n")
-                    continue
-                with open(path+'settings.json') as json_file:
-                    cfg = json.load(json_file)
-                print(cfg)
-                if not cfg["enabled"]:
-                    continue
-                logfile.write("Crawler "+cfg["crawler"]+" started at: " +datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S")+"\n")
-                new_cwd = os.getcwd()+"/spiders/"+cfg["crawler"]
-                if os.path.exists(new_cwd):
-                    os.chdir(new_cwd)
-                    os.system("python3 ../../../scripts/siteCrawler.py " + path)
-                else:
-                    logfile.write("\t spider not found.\n")
-                print(os.getcwd())
-                os.chdir(cwd)
+    Main function to process directories and run crawlers.
+    '''
+    cwd = os.getcwd()
+    if len(argv) != 2 and len(argv) != 3:
+        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
+    else:
+        rootdir = argv[1]
+        # Ensure the log directory exists
+        if not os.path.exists(rootdir):
+            os.makedirs(rootdir)
+        log_filename = rootdir + "/" + datetime.datetime.today().strftime('%Y-%m-%d') + ".log"
+        with open(log_filename, "a") as logfile:
+            for path in sorted(glob.glob(f'{rootdir}/*/')):
+                print(path)
+                if not os.path.exists(path + 'settings.json'):
+                    logfile.write("\t configuration file <" + path + "settings.json> not found.\n")
+                    continue
+                with open(path + 'settings.json') as json_file:
+                    cfg = json.load(json_file)
+                print(cfg)
+                if not cfg["enabled"]:
+                    continue
+                logfile.write("Crawler " + cfg["crawler"] + " started at: " +
+                              datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S") + "\n")
+                new_cwd = os.getcwd() + "/spiders/" + cfg["crawler"]
+                if os.path.exists(new_cwd):
+                    os.chdir(new_cwd)
+                    os.system("python3 ../../../scripts/siteCrawler.py " + path)
+                else:
+                    logfile.write("\t spider not found.\n")
+                print(os.getcwd())
+                os.chdir(cwd)
 #-------------------------------------------------------------------------------
 if __name__ == "__main__":
     main(sys.argv)
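Note on the per-crawler configuration: for each subdirectory of <directory>, the runner reads settings.json but only consumes the "enabled" and "crawler" keys before changing into spiders/<crawler> and shelling out to siteCrawler.py with the directory path. A minimal settings.json this loop would accept is sketched below; the crawler name is a made-up example, and any further keys (an end date, section list, etc.) would belong to siteCrawler.py, which this diff does not show.

{
    "enabled": true,
    "crawler": "animalPolitico"
}

Worth noting: main() accepts an optional third argument ([endDate:YYYY-MM-DD]) but never references argv[2], so the end date presumably has to reach siteCrawler.py through settings.json rather than through this command line.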
@@ -84,14 +84,12 @@ class QuotesSpider(scrapy.Spider):
         self.uri_base = "http://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
         self.uri_page = "%22}&type=page&page="
         self.uri_complement = "&size=10"
-        print(self.uri_base+self.uri_page+self.uri_complement)
-        print(self.uri_base+self.uri_page+self.uri_complement)
         for s in sectionList:
             yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
     def parse(self, response):
         print(response.url)
         searchData = ImportantData()
         CONTINUE_SEARCHING = True
...
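Side note on the removed debug prints: they concatenated uri_base, uri_page, and uri_complement directly, without a section slug or page number in between, so they printed a malformed URL. Judging from where the string is split, a real request URL is presumably assembled per section and page along these lines (a sketch; the slug and page values are invented):

# Hypothetical reconstruction of one elfinanciero API request URL.
uri_base = "http://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
uri_page = "%22}&type=page&page="
uri_complement = "&size=10"

slug = "economia"   # hypothetical section slug
page = 1            # hypothetical page number
url = uri_base + slug + uri_page + str(page) + uri_complement
# %22 is a URL-encoded double quote, so the json parameter decodes to
# {"search":"*","categoriesslug":"economia"}, followed by &type=page&page=1&size=10.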