animalPolitico

parent 1a13f0d7
......@@ -4,34 +4,38 @@ import datetime
import glob
import json
import os
#===============================================================================
#===============================================================================
def main(argv):
    '''
    Walk every immediate subdirectory of a root directory and launch the
    configured site crawler for each one, appending progress to a dated log.

    Parameters:
        argv: command-line argument list (sys.argv style).
              argv[1] is the root directory containing one subdirectory per
              crawler, each with a settings.json; argv[2] is an optional
              end date (YYYY-MM-DD) — accepted but not consumed here
              (presumably read by siteCrawler.py; verify against caller).

    Side effects: creates the root directory if missing, appends to
    <rootdir>/<today>.log, changes the working directory while each spider
    runs, and shells out to scripts/siteCrawler.py via os.system.
    Returns None.
    '''
    # Remember the starting directory so we can restore it at the end,
    # since each spider run chdir()s into its own folder.
    cwd = os.getcwd()
    if len(argv) != 2 and len(argv) != 3:
        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
    else:
        rootdir = argv[1]
        # Ensure the log directory exists before opening the log file.
        if not os.path.exists(rootdir):
            os.makedirs(rootdir)
        log_filename = rootdir + "/" + datetime.datetime.today().strftime('%Y-%m-%d') + ".log"
        with open(log_filename, "a") as logfile:
            # One subdirectory per crawler; sorted for deterministic order.
            for path in sorted(glob.glob(f'{rootdir}/*/')):
                print(path)
                if not os.path.exists(path + 'settings.json'):
                    logfile.write("\t configuration file <" + path + "settings.json> not found.\n")
                    continue
                with open(path + 'settings.json') as json_file:
                    cfg = json.load(json_file)
                print(cfg)
                # Skip crawlers explicitly disabled in their settings.
                if not cfg["enabled"]:
                    continue
                logfile.write("Crawler " + cfg["crawler"] + " started at: " +
                              datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S") + "\n")
                new_cwd = os.getcwd() + "/spiders/" + cfg["crawler"]
                if os.path.exists(new_cwd):
                    # Run the spider from inside its own directory; the
                    # relative path back to the script assumes this layout.
                    os.chdir(new_cwd)
                    os.system("python3 ../../../scripts/siteCrawler.py " + path)
                else:
                    logfile.write("\t spider not found.\n")
                print(os.getcwd())
    # Restore the original working directory regardless of what ran.
    os.chdir(cwd)
#-------------------------------------------------------------------------------
# Script entry point: forward the raw command-line arguments to main().
# NOTE(review): relies on `import sys` at module top — not visible in this
# chunk of the file; confirm it is present above the diff hunk.
if __name__ == "__main__":
    main(sys.argv)
......@@ -84,12 +84,10 @@ class QuotesSpider(scrapy.Spider):
self.uri_base = "http://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
self.uri_page = "%22}&type=page&page="
self.uri_complement = "&size=10"
print(self.uri_base+self.uri_page+self.uri_complement)
for s in sectionList:
yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
def parse(self, response):
print(response.url)
searchData = ImportantData()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment