Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
crawlersNoticias
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
4
Issues
4
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
m3
crawlersNoticias
Commits
a79008f2
Commit
a79008f2
authored
Aug 03, 2017
by
Mario Chirinos Colunga
💬
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
script
parent
a2cb46ab
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
113 additions
and
5 deletions
+113
-5
crawl_all.py
crawler_script/crawl_all.py
+69
-5
crawl_all.sh
crawler_script/crawl_all.sh
+3
-0
crawler_data.json
crawler_script/crawler_data.json
+26
-0
2017-01-06.json
sitios_yucatan/alChile/2017-01-06.json
+15
-0
__init__.pyc
sitios_yucatan/alChile/alChile/__init__.pyc
+0
-0
settings.pyc
sitios_yucatan/alChile/alChile/settings.pyc
+0
-0
__init__.pyc
sitios_yucatan/alChile/alChile/spiders/__init__.pyc
+0
-0
noticias.pyc
sitios_yucatan/alChile/alChile/spiders/noticias.pyc
+0
-0
No files found.
crawler_script/crawl_all.py
View file @
a79008f2
...
...
@@ -2,11 +2,75 @@
# -*- coding: utf-8 -*-
"""Run every configured news crawler for each day that is still missing.

Usage: ``python crawl_all.py <sites.json>``

The JSON file holds a list of objects.  Three keys are read from each one:
``nombre`` (label printed to the console), ``crawler`` (Scrapy project
directory, relative to ``scrapyDir``) and ``desde`` (first date to crawl,
formatted ``DD-MM-YYYY``).

Results are stored as ``<baseDir>/<media>/<year>/<YYYY-MM-DD>.json`` where
``<media>`` is the part of ``crawler`` after its first "/".  On every run the
newest stored year and day are looked up and crawling resumes from that day
(which is crawled again) up to and including today.

The code deliberately sticks to constructs that behave the same on Python 2
and Python 3.
"""
import datetime
import errno
import json
import os
import shutil
import subprocess
import sys

# Root directory under which the crawled JSON files are stored.
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# Root directory holding the Scrapy projects named by each site's "crawler".
# NOTE(review): "cawlersNoticias" may be a misspelling of the checkout
# directory -- confirm against the deployed layout before changing it.
scrapyDir = "/home/geoint/cawlersNoticias/"

REQUIRED_KEYS = ('nombre', 'crawler', 'desde')


def ensure_dir(path):
    """Create *path* (and its parents) unless it already is a directory.

    Raises:
        OSError: for any failure other than "directory already exists".
    """
    try:
        os.makedirs(path)
    except OSError as err:
        # Only an existing directory is acceptable; permission problems or a
        # plain file in the way must not be silently ignored.
        if err.errno != errno.EEXIST or not os.path.isdir(path):
            raise


def parse_site(site):
    """Validate one site entry and return ``(nombre, crawler, desde)``.

    ``desde`` is returned as a ``datetime.datetime`` at midnight.

    Raises:
        ValueError: if a required key is missing or ``desde`` does not match
            ``DD-MM-YYYY``.
    """
    missing = [key for key in REQUIRED_KEYS if key not in site]
    if missing:
        raise ValueError("entry %r lacks required key(s): %s"
                         % (site.get('nombre', '?'), ", ".join(missing)))
    desde = datetime.datetime.strptime(site['desde'], '%d-%m-%Y')
    return site['nombre'], site['crawler'], desde


def media_name(crawler):
    """Return the part of *crawler* after its first "/" (all of it if none)."""
    # str.find() yields -1 when there is no "/", so the slice starts at 0.
    return crawler[crawler.find("/") + 1:]


def first_pending_year(media_dir, desde):
    """Return the year to resume from for one media directory.

    That is the newest all-digit entry in *media_dir*, or ``desde.year`` when
    there is none.  Entries that are not all digits are ignored.
    """
    years = [int(name) for name in os.listdir(media_dir) if name.isdigit()]
    return max(years) if years else desde.year


def first_pending_date(year_dir, year, desde):
    """Return the first date to crawl inside *year_dir*.

    * If the directory already holds ``YYYY-MM-DD.*`` files, the newest of
      those dates is returned, so the last stored day is crawled again.
    * Otherwise crawling starts at *desde* when *year* is its year, and at
      1 January of *year* for any other year.

    Hidden entries and names that do not start with a valid date are ignored.
    """
    stored = []
    for entry in os.listdir(year_dir):
        if entry.startswith('.'):
            continue
        stem = entry.split('.', 1)[0]
        try:
            stored.append(datetime.datetime.strptime(stem, '%Y-%m-%d'))
        except ValueError:
            continue
    if stored:
        return max(stored)
    if year != desde.year:
        return datetime.datetime(year, 1, 1)
    return desde


def crawl_dates(start, year, today):
    """Yield each day from *start* through the last crawlable day of *year*.

    The last crawlable day is 31 December of *year* or *today*, whichever
    comes first.  Stopping at *today* keeps future-dated files from being
    created; such files would move the resume point past days that have not
    happened yet.  Using the real 31 December also covers leap years.
    """
    last = min(datetime.datetime(year, 12, 31), today)
    current = start
    while current <= last:
        yield current
        current += datetime.timedelta(days=1)


def scrapy_command(date):
    """Return ``(filename, argv)`` for crawling the news of *date*.

    Equivalent shell invocation, which runs the spider of whichever project
    is the working directory:
    ``scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d``
    """
    filename = date.strftime('%Y-%m-%d') + ".json"
    argv = ["scrapy", "crawl", "noticias", "-t", "json", "-o", filename,
            "-a", "year=" + str(date.year),
            "-a", "month=" + str(date.month),
            "-a", "day=" + str(date.day)]
    return filename, argv


def crawl_day(spider_dir, dest_dir, date):
    """Crawl one day inside *spider_dir* and move the result to *dest_dir*.

    Returns the exit status reported by scrapy.

    Raises:
        OSError: if scrapy cannot be started or *spider_dir* does not exist.
    """
    filename, argv = scrapy_command(date)
    print(" ".join(argv))
    # An argument list plus cwd= avoids both shell quoting problems and
    # changing the working directory of this process.
    status = subprocess.call(argv, cwd=spider_dir)
    produced = os.path.join(spider_dir, filename)
    if os.path.exists(produced):
        # Naming the full target path lets a re-crawled day replace its
        # previous file.
        shutil.move(produced, os.path.join(dest_dir, filename))
    else:
        sys.stderr.write("no output for %s (scrapy exit status %s)\n"
                         % (filename, status))
    return status


def crawl_site(nombre, crawler, desde, today):
    """Crawl every pending day of one site, from its resume point to *today*."""
    print(nombre + ", desde:" + desde.strftime("%Y-%m-%d"))
    media_dir = os.path.join(baseDir, media_name(crawler))
    spider_dir = os.path.join(scrapyDir, crawler)
    ensure_dir(media_dir)
    for year in range(first_pending_year(media_dir, desde), today.year + 1):
        print(year)
        year_dir = os.path.join(media_dir, str(year))
        ensure_dir(year_dir)
        start = first_pending_date(year_dir, year, desde)
        for date in crawl_dates(start, year, today):
            crawl_day(spider_dir, year_dir, date)


def main(argv=None):
    """Entry point: crawl every site listed in the JSON file ``argv[1]``.

    Returns a process exit status: 0 on success, 1 if ``baseDir`` is missing
    or at least one site entry was skipped as invalid, 2 on bad usage.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("usage: crawl_all.py <sites.json>\n")
        return 2

    with open(argv[1]) as data_file:
        siteList = json.load(data_file)
    print(siteList)

    # Do not create baseDir implicitly: if it is absent the storage location
    # is probably unavailable, and the run should stop instead of writing
    # somewhere else.
    if not os.path.isdir(baseDir):
        sys.stderr.write("output directory not found: %s\n" % baseDir)
        return 1

    today = datetime.datetime.now()
    skipped = 0
    for site in siteList:
        try:
            nombre, crawler, desde = parse_site(site)
        except ValueError as err:
            # One bad entry should not prevent the other sites from running.
            sys.stderr.write("skipping site: %s\n" % err)
            skipped += 1
            continue
        crawl_site(nombre, crawler, desde, today)

    print(today.year)
    return 1 if skipped else 0


if __name__ == "__main__":
    sys.exit(main())
crawler_script/crawl_all.sh
View file @
a79008f2
#!/bin/bash
# Launch the crawler driver with Python, passing the JSON site list as its
# only argument.
# NOTE(review): this commit adds the driver as crawler_script/crawl_all.py and
# the site list as crawler_script/crawler_data.json, and crawl_all.py sets
# scrapyDir to /home/geoint/cawlersNoticias/ -- confirm that the
# /home/geoint/crawlerNoticias/ paths below match the deployed layout.
python /home/geoint/crawlerNoticias/crawler_script.py /home/geoint/crawlerNoticias/crawler_data.json
crawler_script/crawler_data.json
0 → 100644
View file @
a79008f2
[
{
"nombre"
:
"alChile"
,
"crawler"
:
"sitios_yucatan/alChile"
},
{
"nombre"
:
"desdeElBalcon"
,
"crawler"
:
"sitios_yucatan/desdeElBalcon"
},
{
"nombre"
:
"diarioYucatan"
,
"crawler"
:
"sitios_yucatan/diarioYucatan"
},
{
"nombre"
:
"grilloPorteno"
,
"crawler"
:
"sitios_yucatan/grilloPorteno"
},
{
"nombre"
:
"laJornadaMaya"
,
"crawler"
:
"sitios_yucatan/laJornadaMaya"
},
{
"nombre"
:
"laVerdadYuc"
,
"crawler"
:
"sitios_yucatan/laVerdadYuc"
},
{
"nombre"
:
"lectorMX"
,
"crawler"
:
"sitios_yucatan/lectorMX"
},
{
"nombre"
:
"miPuntoDeVista"
,
"crawler"
:
"sitios_yucatan/miPuntoDeVista"
},
{
"nombre"
:
"notirivas"
,
"crawler"
:
"sitios_yucatan/notirivas"
},
{
"nombre"
:
"notisureste"
,
"crawler"
:
"sitios_yucatan/notisureste"
},
{
"nombre"
:
"puntoMedio"
,
"crawler"
:
"sitios_yucatan/puntoMedio"
},
{
"nombre"
:
"sona893"
,
"crawler"
:
"sitios_yucatan/sona893"
},
{
"nombre"
:
"yucatanALaMano"
,
"crawler"
:
"sitios_yucatan/yucatanALaMano"
},
{
"nombre"
:
"yucatanAlMinuto"
,
"crawler"
:
"sitios_yucatan/yucatanAlMinuto"
},
{
"nombre"
:
"yucatanEnCorto"
,
"crawler"
:
"sitios_yucatan/yucatanEnCorto"
},
{
"nombre"
:
"diarioYaqui"
,
"crawler"
:
"otros_sitios/diarioYaqui"
},
{
"nombre"
:
"laJornada"
,
"crawler"
:
"otros_sitios/laJornada"
},
{
"nombre"
:
"laJornadaAgs"
,
"crawler"
:
"otros_sitios/laJornadaAgs"
},
{
"nombre"
:
"laJornadaBC"
,
"crawler"
:
"otros_sitios/laJornadaBC"
},
{
"nombre"
:
"laJornadaGro"
,
"crawler"
:
"otros_sitios/laJornadaGro"
},
{
"nombre"
:
"laJornadaOte"
,
"crawler"
:
"otros_sitios/laJornadaOte"
},
{
"nombre"
:
"laJornadaSanLuis"
,
"crawler"
:
"otros_sitios/laJornadaSanLuis"
},
{
"nombre"
:
"laJornadaVer"
,
"crawler"
:
"otros_sitios/laJornadaVer"
},
{
"nombre"
:
"laJornadaZac"
,
"crawler"
:
"otros_sitios/laJornadaZac"
}
]
\ No newline at end of file
sitios_yucatan/alChile/2017-01-06.json
0 → 100644
View file @
a79008f2
This diff is collapsed.
Click to expand it.
sitios_yucatan/alChile/alChile/__init__.pyc
View file @
a79008f2
No preview for this file type
sitios_yucatan/alChile/alChile/settings.pyc
View file @
a79008f2
No preview for this file type
sitios_yucatan/alChile/alChile/spiders/__init__.pyc
View file @
a79008f2
No preview for this file type
sitios_yucatan/alChile/alChile/spiders/noticias.pyc
View file @
a79008f2
No preview for this file type
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment