#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape seriesyonkis.com: enumerate every series in the left menu, visit
each series page for its episodes, and resolve + print the final
streaming/download link for every episode.

Ported to Python 3 (print function, no setdefaultencoding hack, `!=` instead
of the removed `<>`) and to the Selenium 4 API (`find_elements(By...)`
replaces the removed `find_elements_by_*` helpers).
"""
from selenium import webdriver
from selenium.webdriver.common.by import By

BASE_URL = "http://www.seriesyonkis.com/ultimos-capitulos.php"

# Left-menu entries that are section headings, not real series links.
SKIP_TITLES = ("Series:", "películas aquí")

# Known locations of the real target link on the intermediate link page;
# the layout varies, so each candidate XPath is tried in turn.
LINK_XPATHS = (
    "/html/body/span/b/a",
    "/html/body/center[2]/table/tbody/tr/td/center/h3[1]/a",
    "/html/body/ol/li/span[2]/b/a",
)


def get_series(browser):
    """Return [(url, title), ...] for every series anchor in the left menu."""
    browser.get(BASE_URL)
    menu = browser.find_elements(By.CLASS_NAME, "leftmenugroup")[0]
    series = []
    for anchor in menu.find_elements(By.XPATH, ".//a"):
        href = anchor.get_attribute("href")
        series.append((href, anchor.text))
        print(href + " " + anchor.text)
    print(series)
    return series


def get_episodes(browser, serie_url):
    """Return [(url, title), ...] for each episode (first <a> of each <h5>)."""
    browser.get(serie_url)
    episodes = []
    for heading in browser.find_elements(By.TAG_NAME, "h5"):
        anchor = heading.find_elements(By.XPATH, ".//a")[0]
        episodes.append((anchor.get_attribute("href"), anchor.text))
    return episodes


def get_links(browser, episode_url):
    """Return [language, subtitles, info, href] for each link row of an episode.

    Table columns (by observation of the site): td[2]=language,
    td[3]=subtitles, td[5]=duration/format/size.
    """
    browser.get(episode_url)
    links = []
    for row in browser.find_elements(By.TAG_NAME, "tr"):
        cells = row.find_elements(By.XPATH, ".//td")
        if len(cells) < 6:
            # Header/malformed rows lack the data columns; skip instead of
            # crashing with IndexError.
            continue
        for anchor in row.find_elements(By.XPATH, ".//a"):
            links.append([cells[2].text, cells[3].text, cells[5].text,
                          anchor.get_attribute("href")])
    return links


def resolve_links(browser, links):
    """Follow each intermediate page and collect the real target URLs."""
    resolved = []
    for language, subtitles, info, href in links:
        browser.get(href)
        for xpath in LINK_XPATHS:
            for anchor in browser.find_elements(By.XPATH, xpath):
                resolved.append([language, subtitles, info,
                                 anchor.get_attribute("href")])
    return resolved


def main():
    browser = webdriver.Chrome()  # local Chrome session
    try:
        for serie_url, serie_title in get_series(browser):
            if serie_title in SKIP_TITLES:
                continue
            episodes = get_episodes(browser, serie_url)
            print(episodes)
            for episode_url, episode_title in episodes:
                links = get_links(browser, episode_url)
                for entry in resolve_links(browser, links):
                    print(episode_title + " " + str(entry))
                print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    finally:
        # quit() ends the whole WebDriver session even on error;
        # close() would only close the current window and leak the driver.
        browser.quit()


if __name__ == "__main__":
    main()