javascript - Scrapy crawling not working on ASPX website -
I'm scraping the Madrid Assembly's website, which is built in ASPX, and I have no idea how to simulate clicks on the links I need in order to reach the corresponding politicians. I tried this:
import scrapy class asambleamadrid(scrapy.spider): name = "asamblea_madrid" start_urls = ['http://www.asambleamadrid.es/es/queeslaasamblea/composiciondelaasamblea/losdiputados/paginas/relacionalfabeticadiputados.aspx'] def parse(self, response): id in response.css('div#modulobusqueda div.sangria div.sangria ul li a::attr(id)'): target = id.extract() url = "http://www.asambleamadrid.es/es/queeslaasamblea/composiciondelaasamblea/losdiputados/paginas/relacionalfabeticadiputados.aspx" formdata= {'__eventtarget': target, '__viewstate': '/wepdwubma9kfgjmd2qwagibd2qwbaibd2qwagigd2qwamypzbycagmpzbycagmpfgiee1byzxzpb3vzq29udhjvbe1vzgulkygbtwljcm9zb2z0llnoyxjlug9pbnquv2viq29udhjvbhmuu1bdb250cm9stw9kzswgtwljcm9zb2z0llnoyxjlug9pbnqsifzlcnnpb249mtqumc4wljasien1bhr1cmu9bmv1dhjhbcwguhvibgljs2v5vg9rzw49nzflowjjztexmwu5ndi5ywfkagmpzbymagmpzbygbsznxzm2zwewmzewxzg5m2rfngexov85zwqxxzg4ytezm2qwnjqymw9kfgjmd2qwagibdxychgtfiul0zw1db3vudaiefghmd2qwagibdw8wbb4pq29tbwfuzefyz3vtzw50btrhcnvwbybqyxjsyw1lbnrhcmlvifbvchvsyxigzgugbgegqxnhbwjszwegzgugtwfkcmlkhgruzxh0btrhcnvwbybqyxjsyw1lbnrhcmlvifbvchvsyxigzgugbgegqxnhbwjszwegzgugtwfkcmlkzgqcaq9kfgicaq8pfgqfaguer3j1cg8gugfybgftzw50yxjpbybtb2npywxpc3rhhwmfhkdydxbvifbhcmxhbwvudgfyaw8gu29jawfsaxn0ywrkagipzbycagepdxyehwifl0dydxbvifbhcmxhbwvudgfyaw8gug9kzw1vcybdb211bmlkywqgzgugtwfkcmlkhwmfl0dydxbvifbhcmxhbwvudgfyaw8gug9kzw1vcybdb211bmlkywqgzgugtwfkcmlkzgqcaw9kfgicaq8pfgqfaguhr3j1cg8gugfybgftzw50yxjpbybkzsbdaxvkywrhbm9zhwmfiudydxbvifbhcmxhbwvudgfyaw8gzgugq2l1zgfkyw5vc2rkbsznx2mxntfkmgixxzy2ywzfndhjy185mwm3x2jloguxmtzkn2q1mg9kfgrmdxychgdwaxnpymxlagqcaq8wah8eagqfjmdfztbmywvimtvfogi3nl80mjgyx2exyjffnti3zdiwnjk1ody2d2qwbgypfgifbghkagepfgifbghkahepzbycagepzbyezg9kfgicaq8wah8eabyczg9kfgqcag9kfgqcaq8wah8eagqcaw8wcb4tq2xpzw50t25dbglja1njcmlwdaw7awphdmfty3jpchq6q29yzuludm9rzsgnvgfrzu9mzmxpbmvub0nsawvudfjlywwnldesidesicdodhrwolx1mdayzlx1mdayznd3dy5hc2ftymxlyw1hzhjpzc5lc1x1mdayzkvtxhuwmdjmuxvlrxnmyufzyw1ibgvhxhuwmdjmq29tcg9zawnpb25kzwxhqxnhbwjszwfcdtawmmzmb3neaxb1dg
fkb3mnlcatmswgltesiccnlcanjykegensawvude9uq2xpy2toyxzpz2f0zvvybgqekensawvude9uq2xpy2tty3jpchrdb250ywluaw5nuhjlzml4zwrvcmxkhgxiawrkzw5ty3jpchqfivrha2vpzmzsaw5lrglzywjszwqomswgmswgltesic0xkwqcaw8pfgoecufjy2vzc0tlequblx4pqxjyb3djbwfnzvdpzhroagueeefycm93sw1hz2vizwlnahqcax4rqxjyb3djbwfnzu9mznnldfhmhhfbcnjvd0ltywdlt2zmc2v0wqlra2rkagepzbycagupzbycagepebychwrozbqraqbkahcpzbyizg8pfgqfawuprw5nbglzacbwzxjzaw9uhgtoyxzpz2f0zvvybavfl0vol1f1zuvztgfbc2ftymxlys9db21wb3npy2lvbmrlbgfbc2ftymxlys9mb3neaxb1dgfkb3mvugfnzxmvumvsywnpb25bbgzhymv0awnhrglwdxrhzg9zlmfzchhkzaicdw8wbb8dbqzqcmvuc2efdguyl0vtl0jpzw52zw5pzgfqcmvuc2evugfnaw5hcy9cawvudmvuawrhuhjlbnnhlmfzchhkzaiedw8wbb8dbrpjzgvudglmawnhy2nds24gzgugvxn1yxjpbx8obtqvrvmvqxjlyvvzdwfyaw9zl1bhz2luyxmvswrlbnrpzmljywnpb25vc3vhcmlvcy5hc3b4zgqcbg8pfgqfawugq29ycmvvhw4fkgh0dha6ly9vdxrsb29rlmnvbs9vd2evyxnhbwjszwftywryawquzxnkzaild2qwagidd2qwagibdxychwalkwqbzai1d2qwagihd2qwagibdw8wah8eagqwagidd2qwamypzbycagmpzbycagupdxyehgzizwlnahqbaaaaaaaaeuabaaaahgrfivncaoabzbycageppcsacqeadxyehg1qyxrou2vwyxjhdg9ybagedu5ldmvyrxhwyw5kzwrnzgqcsq9kfgicag9kfgicaq9kfgicaw8wah8acyseawqyagvby3rsmdakugxhy2vib2xkzxjmzwz0tmf2qmfyjfvjvmvyc2lvbmvkq29udgvuddmkvjrrdwlja0xhdw5jae1lbnupd2qfkunvbxbvc2ljacozbibkzsbsysbbc2ftymxlyvxmb3mgrglwdxrhzg9zzavhy3rsmdakugxhy2vib2xkzxjub3boyxzcyxikugxhy2vib2xkzxjib3jpem9udgfstmf2jfrvce5hdmlnyxrpb25nzw51vjqpd2qfgkluawnpb1xrdcopigvzigxhiefzyw1ibgvhzj', '__eventvalidation': '/wewcalihqvyawkh2yvvaudf1kudaqck1buoaqckybkpaqcknbqcaqckszejavejv84dtkx5dcfr3qgqqd2wsfqh8np3iq8', '__viewstategenerator': 'bab98cb3', '__requestdigest': '0x476239970dcfdabdbbdf638a1f9b026bd43022a10d1d757b05f1071ff3104459b4666f96a47b4845d625bcb2be0d88c6e150945e8f5d82c189b56a0da4bc859d'} yield scrapy.formrequest(url=url, formdata= formdata, callback=self.takeeachparty) def takeeachparty(self, response): print response.css('ul.listadovert02 ul li::text').extract()
Going through the source code of the website, I can see how the links look and how they send the JavaScript query. This is one of the links I need to access:
<a id="ctl00_m_g_36ea0310_893d_4a19_9ed1_88a133d06423_ctl00_repeater1_ctl00_lnk_grupo" href="javascript:webform_dopostbackwithoptions(new webform_postbackoptions("ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$repeater1$ctl00$lnk_grupo", "", true, "", "", false, true))">grupo parlamentario popular de la asamblea de madrid</a>
I have been reading many articles about this, but probably the problem is my own ignorance in this respect.
Thanks in advance.
Edited:
SOLUTION: I did it! Just translating the helpful code from Padraic Cunningham into the Scrapy way. As I specified the issue as being about Scrapy, I want to post the result in case someone else has the same problem I had.
So here it goes:
import scrapy
import js2xml


class AsambleaMadrid(scrapy.Spider):
    name = "AsambleaMadrid"
    start_urls = ['http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx']

    def parse(self, response):
        source = response
        hrefs = response.xpath("//*[@id='moduloBusqueda']//div[@class='sangria']/ul/li/a/@href").extract()
        form_data = self.validate(source)
        for ref in hrefs:
            # js2xml allows us to parse the JS function and its params, and so to grab the __EVENTTARGET
            js_xml = js2xml.parse(ref)
            _id = js_xml.xpath(
                "//identifier[@name='WebForm_PostBackOptions']/following-sibling::arguments/string[starts-with(.,'ctl')]")[0]
            form_data["__EVENTTARGET"] = _id.text
            url_diputado = 'http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx'
            # The proper way to send a POST in Scrapy is by using FormRequest
            yield scrapy.FormRequest(url=url_diputado, formdata=form_data, callback=self.extract_parties, method='POST')

    def validate(self, source):
        # These fields are the minimum required and cannot be hardcoded
        data = {"__VIEWSTATEGENERATOR": source.xpath("//*[@id='__VIEWSTATEGENERATOR']/@value")[0].extract(),
                "__EVENTVALIDATION": source.xpath("//*[@id='__EVENTVALIDATION']/@value")[0].extract(),
                "__VIEWSTATE": source.xpath("//*[@id='__VIEWSTATE']/@value")[0].extract(),
                "__REQUESTDIGEST": source.xpath("//*[@id='__REQUESTDIGEST']/@value")[0].extract()}
        return data

    def extract_parties(self, response):
        source = response
        name = source.xpath("//ul[@class='listadoVert02']/ul/li/a/text()").extract()
        print(name)
I hope it is clear. Thanks everybody, again!
If you look at the data posted to the form in Chrome or Firebug, you can see there are many fields passed in the POST request; there are a few that are essential and must be parsed from the original page. Parsing the ids from the div.sangria ul li a
tags is not sufficient, as the actual data posted is slightly different: what is posted is inside the JavaScript function, WebForm_DoPostBackWithOptions,
which is in the href and not in the id attribute:
href='javascript:webform_dopostbackwithoptions(new webform_postbackoptions("ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$repeater1$ctl03$lnk_grupo", "", true, "", "", false, true))'>
Sometimes all the underscores are simply replaced with dollar signs, so it would be easy to str.replace them to get the data in the correct order, but that is not the case here; we could use a regex to parse it out, but the js2xml lib can parse a JavaScript function and its args into an xml tree.
The following code, using requests, shows how you can get the data from the initial request and then get to all the pages you want:
import requests
from lxml import html
import js2xml

post = "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx"


def validate(xml):
    # These fields are the minimum required and cannot be hardcoded
    data = {"__VIEWSTATEGENERATOR": xml.xpath("//*[@id='__VIEWSTATEGENERATOR']/@value")[0],
            "__EVENTVALIDATION": xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0],
            "__VIEWSTATE": xml.xpath("//*[@id='__VIEWSTATE']/@value")[0],
            "__REQUESTDIGEST": xml.xpath("//*[@id='__REQUESTDIGEST']/@value")[0]}
    return data


with requests.Session() as s:
    # Make the initial request to get the links/hrefs and the form fields
    r = s.get(
        "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx")
    xml = html.fromstring(r.content)
    hrefs = xml.xpath("//*[@id='moduloBusqueda']//div[@class='sangria']/ul/li/a/@href")
    form_data = validate(xml)
    for h in hrefs:
        js_xml = js2xml.parse(h)
        _id = js_xml.xpath(
            "//identifier[@name='WebForm_PostBackOptions']/following-sibling::arguments/string[starts-with(.,'ctl')]")[0]
        form_data["__EVENTTARGET"] = _id.text
        r = s.post(post, data=form_data)
        xml = html.fromstring(r.content)
        print(xml.xpath("//ul[@class='listadoVert02']/ul/li/a/text()"))
If you run the code above, you will see the different text output from the anchor tags:
in [2]: requests.session() s: ...: r = s.get( ...: "http://www.asambleamadrid.es/es/queeslaasamblea/composiciondelaasamblea/losdiputados/paginas/relacionalfabeticadiputados.aspx") ...: xml = html.fromstring(r.content) ...: hrefs = xml.xpath("//*[@id='modulobusqueda']//div[@class='sangria']/ul/li/a/@href") ...: form_data = validate(xml) ...: h in hrefs: ...: js_xml = js2xml.parse(h) ...: _id = js_xml.xpath( ...: "//identifier[@name='webform_postbackoptions']/following-sibling::arguments/string[starts-with(.,'ctl')]")[ ...: 0] ...: form_data["__eventtarget"] = _id.text ...: r = s.post(post, data=form_data) ...: xml = html.fromstring(r.content) ...: print(xml.xpath("//ul[@class='listadovert02']/ul/li/a/text()")) ...: [u'abo\xedn abo\xedn, sonsoles trinidad', u'adrados gautier, m\xaa paloma', u'aguado del olmo, m\xaa josefa', u'\xc1lvarez padilla, m\xaa nadia', u'arribas del barrio, jos\xe9 m\xaa', u'ballar\xedn valc\xe1rcel, \xc1lvaro c\xe9sar', u'berrio fern\xe1ndez-caballero, m\xaa in\xe9s', u'berzal andrade, jos\xe9 manuel', u'cam\xedns mart\xednez, ana', u'carballedo berlanga, m\xaa eugenia', 'cifuentes cuencas, cristina', u'd\xedaz ayuso, isabel natividad', u'escudero d\xedaz-tejeiro, marta', u'fermosel d\xedaz, jes\xfas', u'fern\xe1ndez-quejo del pozo, jos\xe9 luis', u'garc\xeda de vinuesa gardoqui, ignacio', u'garc\xeda mart\xedn, mar\xeda bego\xf1a', u'garrido garc\xeda, \xc1ngel', u'g\xf3mez ruiz, jes\xfas', u'g\xf3mez-angulo rodr\xedguez, juan antonio', u'gonz\xe1lez gonz\xe1lez, isabel gema', u'gonz\xe1lez jim\xe9nez, bartolom\xe9', u'gonz\xe1lez taboada, jaime', u'gonz\xe1lez-mo\xf1ux v\xe1zquez, elena', u'gonzalo l\xf3pez, rosal\xeda', 'izquierdo torres, carlos', u'li\xe9bana montijano, pilar', u'mari\xf1o ortega, ana isabel', u'moraga valiente, \xc1lvaro', u'mu\xf1oz abrines, pedro', u'n\xfa\xf1ez guijarro, jos\xe9 enrique', u'olmo fl\xf3rez, luis del', u'ongil cores, m\xaa gador', 'ortiz espejo, daniel', u'ossorio crespo, enrique mat\xedas', 'peral 
guerra, luis', u'p\xe9rez baos, ana isabel', u'p\xe9rez garc\xeda, david', u'pla\xf1iol de lacalle, regina m\xaa', u'redondo alcaide, m\xaa isabel', u'roll\xe1n ojeda, pedro', u's\xe1nchez fern\xe1ndez, alejandro', 'sanjuanbenito bonal, diego', u'serrano guio, jos\xe9 tom\xe1s', u'serrano s\xe1nchez-capuchino, alfonso carlos', 'soler-espiauba gallo, juan', 'toledo moreno, lucila', 'van-halen acedo, juan'] [u'andaluz andaluz, m\xaa isabel', u'ardid jim\xe9nez, m\xaa isabel', u'carazo g\xf3mez, m\xf3nica', u'casares d\xedaz, m\xaa luc\xeda inmaculada', u'cepeda garc\xeda de le\xf3n, jos\xe9 carmelo', 'cruz torrijos, diego', u'delgado g\xf3mez, carla', u'franco pardo, jos\xe9 manuel', u'freire campo, jos\xe9 manuel', u'gabilondo pujol, \xc1ngel', 'gallizo llamas, mercedes', u"garc\xeda d'atri, ana", u'garc\xeda-rojo garrido, pedro pablo', u'g\xf3mez montoya, rafael', u'g\xf3mez-chamorro torres, jos\xe9 \xc1ngel', u'gonz\xe1lez gonz\xe1lez, m\xf3nica silvana', u'leal fern\xe1ndez, m\xaa isaura', u'llop cuenca, m\xaa pilar', 'lobato gandarias, juan', u'l\xf3pez ruiz, m\xaa carmen', u'manguan valderrama, eva m\xaa', u'maroto illera, m\xaa reyes', u'mart\xednez ten, carmen', u'mena romero, m\xaa carmen', u'moreno navarro, juan jos\xe9', u'moya nieto, encarnaci\xf3n', 'navarro lanchas, josefa', 'nolla estrada, modesto', 'pardo ortiz, josefa dolores', u'quintana viar, jos\xe9', u'rico garc\xeda-hierro, enrique', u'rodr\xedguez garc\xeda, nicol\xe1s', u's\xe1nchez acera, pilar', u'sant\xedn fern\xe1ndez, pedro', 'segovia noriega, juan', 'vicente viondi, daniel', u'vinagre alc\xe1zar, agust\xedn'] ['abasolo pozas, olga', 'ardanuy pizarro, miguel', u'beirak ulanosky, jazm\xedn', u'camargo fern\xe1ndez, ra\xfal', 'candela pokorna, marco', 'delgado orgaz, emilio', u'd\xedaz rom\xe1n, laura', u'espinar merino, ram\xf3n', u'espinosa de la llave, mar\xeda', u'fern\xe1ndez rubi\xf1o, eduardo', u'garc\xeda g\xf3mez, m\xf3nica', 'gimeno reinoso, beatriz', u'guti\xe9rrez benito, 
eduardo', 'huerta bravo, raquel', u'l\xf3pez hern\xe1ndez, isidro', u'l\xf3pez rodrigo, jos\xe9 manuel', u'mart\xednez abarca, hugo', u'morano gonz\xe1lez, jacinto', u'ongil l\xf3pez, miguel', 'padilla estrada, pablo', u'ruiz-huerta garc\xeda de viedma, lorena', 'salazar-alonso revuelta, cecilia', u'san jos\xe9 p\xe9rez, carmen', u's\xe1nchez p\xe9rez, alejandro', u'serra s\xe1nchez, isabel', u'serra s\xe1nchez, clara', 'sevillano de las heras, elena'] [u'aguado crespo, ignacio jes\xfas', u'\xc1lvarez cabo, daniel', u'gonz\xe1lez pastor, dolores', u'iglesia vicente, m\xaa teresa de la', 'lara casanova, francisco', u'marb\xe1n de frutos, marta', u'marcos arias, tom\xe1s', u'meg\xedas morales, jes\xfas ricardo', u'n\xfa\xf1ez s\xe1nchez, roberto', 'reyero zubiri, alberto', u'rodr\xedguez dur\xe1n, ana', u'rubio ruiz, juan ram\xf3n', u'ruiz fern\xe1ndez, esther', u'sol\xeds p\xe9rez, susana', 'trinidad martos, juan', 'veloso lozano, enrique', u'zafra hern\xe1ndez, c\xe9sar']
You can add the exact same logic to your spider; I used requests just to show you a working example. You should also be aware that not every asp.net site behaves the same way; you may have to re-validate for every post, as in this related answer.