Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
crawlersNoticias
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
4
Issues
4
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
m3
crawlersNoticias
Commits
5f865616
Commit
5f865616
authored
7 years ago
by
Renán Sosa Guillen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
backwards tracker
parent
2e1d39ff
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
98 additions
and
0 deletions
+98
-0
backwards_tracker.py
crawler_script/backwards_tracker.py
+98
-0
No files found.
crawler_script/backwards_tracker.py
0 → 100644
View file @
5f865616
#!/usr/bin/python
# -*- coding: utf-8 -*-
import datetime
import errno
import json
import os
import shutil
import subprocess
import sys
"""
Script para la descarga de histórico de medios del tipo "descarga_hacia_atras".
"""
# Backwards tracker: for every site listed in the JSON file given as the first
# command-line argument, run its scrapy spider, split the resulting
# "noticias.json" into per-day files with parse_date_files.py and move those
# files into the archive under baseDir/<media>/<year>/.
#
# Every path is built absolutely and the external tools are started with an
# explicit working directory, so the script never depends on (or changes) the
# process-wide current directory.  The previous chain of relative os.chdir()
# calls ended each iteration inside baseDir/<media>, which made the next site
# create its folders inside the previous site's folder.

today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"


def _ensure_dir(path, announce=False):
    """Create ``path`` (with parents) unless it already exists as a directory.

    Only the "already exists" condition is tolerated; any other OSError
    (permissions, read-only filesystem, a regular file in the way, ...) is
    re-raised instead of being silently swallowed.  When ``announce`` is true,
    "ok" is printed if the directory was already there, which keeps the
    script's historical console output.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
        if announce:
            print("ok")


def _visible_sorted(path):
    """Return the sorted names in ``path`` that do not start with a dot."""
    return sorted(name for name in os.listdir(path)
                  if not name.startswith('.'))


def _remove_if_present(path):
    """Delete the file at ``path``; do nothing when it does not exist."""
    if os.path.exists(path):
        os.remove(path)


# Keep the file open only for as long as it takes to parse it.
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)

for s in siteList:
    # The media name is whatever follows the last "/" of the crawler path.
    media = s['crawler'][s['crawler'].rfind("/") + 1:]
    mediaDir = baseDir + media
    _ensure_dir(mediaDir, announce=True)

    # Most recent year already present in the archive, or the current year
    # when nothing has been archived yet.  Non-numeric entries are ignored so
    # that a stray file or folder cannot make int() fail.
    lstYears = sorted(name for name in os.listdir(mediaDir) if name.isdigit())
    if lstYears:
        year = int(lstYears[-1])
    else:
        year = today.date().year
    print(year)

    mydir = os.path.join(mediaDir, str(year))
    _ensure_dir(mydir, announce=True)

    lstDays = _visible_sorted(mydir)
    print(lstDays)

    filename = "noticias.json"
    # NOTE: resuming from the last archived day (passing year/month/day of the
    # newest file in lstDays to the spider with "-a") is currently disabled;
    # the spider is always started without date arguments.
    scrapyArgs = ["scrapy", "crawl", "noticias", "--nolog",
                  "-s", "filename=" + filename]
    scrapycommand = " ".join(scrapyArgs)

    print(mydir)
    crawlerDir = scrapyDir + s['crawler']
    print(media)
    print(scrapycommand)
    # Argument list + cwd instead of a shell string: no quoting problems and
    # no need to chdir into the crawler project.
    subprocess.call(scrapyArgs, cwd=crawlerDir)

    outputFile = os.path.join(crawlerDir, filename)
    # A spider that wrote no file at all is treated like one that wrote an
    # empty result, instead of aborting the remaining sites.
    if os.path.isfile(outputFile):
        fileSize = os.path.getsize(outputFile)
    else:
        fileSize = 0

    if fileSize <= 3:
        # Three bytes or fewer: presumably just an empty JSON list, so there
        # is nothing to archive.
        _remove_if_present(outputFile)
    else:
        subprocess.call(["python3", "parse_date_files.py", s['crawler']],
                        cwd=scrapyDir)

        # The script reads the per-day files from scrapyDir/<media>/<year>/.
        stagingDir = os.path.join(scrapyDir, media)
        mediaYears = sorted(os.listdir(stagingDir))
        allMoved = True
        for yy in mediaYears:
            stagedYear = os.path.join(stagingDir, yy)
            targetYear = baseDir + media + "/" + yy
            _ensure_dir(targetYear)

            mediaDays = _visible_sorted(stagedYear)
            yearMoved = True
            for dd in mediaDays:
                # "mv" keeps the original overwrite semantics; its exit code
                # is checked so files that were not moved are never deleted.
                if subprocess.call(["mv", os.path.join(stagedYear, dd),
                                    targetYear]) != 0:
                    yearMoved = False

            if yearMoved:
                shutil.rmtree(stagedYear)
            else:
                allMoved = False
                print("could not move every file from " + stagedYear)

        if allMoved:
            shutil.rmtree(stagingDir)
        _remove_if_present(outputFile)
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the crawler that corresponds to each site
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment