11import scrapy
2- from scrapy_selenium import SeleniumRequest
32from pathlib import Path
43import requests
54from datetime import datetime
87import platform
98import html
109from unicodedata import normalize
10+ from scrapy import FormRequest
1111
1212# url = 'https://archive.pib.gov.in/archive2/erelease.aspx/'
13- url = "https://pib.gov.in/AllRelease .aspx"
14- pib_url = "https://pib.gov.in/PressReleaseIframePage .aspx?PRID ="
13+ url = "https://archive. pib.gov.in/archive2/erelease .aspx"
14+ pib_url = "https://archive. pib.gov.in/newsite/PrintRelease .aspx?relid ="
1515cwd = Path .cwd ()
16- chromedriver = "selenium/chromedriver"
17- chromedriver_path = Path (cwd , chromedriver ).expanduser ()
1816platform_release = str (platform .release ())
1917today = datetime .today ()
2018
2119
2220class PibSpider (scrapy .Spider ):
23- name = "pib_ddmin "
24- allowed_domains = ["pib.gov.in" ]
21+ name = "pib_archives "
22+ start_urls = ["https://archive. pib.gov.in/archive2/erelease.aspx " ]
2523
26- custom_settings = {
27- "DUPEFILTER_CLASS" : "scrapy.dupefilters.BaseDupeFilter" ,
28- "SELENIUM_DRIVER_EXECUTABLE_PATH" : str (chromedriver_path ),
29- }
30-
31- def start_requests (self ):
24+ def parse (self , response ):
3225 # self.rel_date = self.rel_date_fn()
3326 self .strp_date = datetime .strptime (self .rel_date , "%Y-%m-%d" )
3427 self .minis_code = self .rel_mincode
35-
28+
3629 if (
3730 self .strp_date .date () == today .date ()
3831 and "azure" in platform_release .lower ()
@@ -44,22 +37,23 @@ def start_requests(self):
4437 self .rel_month = self .strp_date .strftime ("%m" )
4538 self .rel_year = self .strp_date .strftime ("%Y" )
4639 self .pib_date = self .strp_date .strftime ("%Y/%b/%d" )
47- self .jyr = f"document.forms.form1.ContentPlaceHolder1_ddlYear.value={ str (self .rel_year ).lstrip ('0' )} ;"
48- self .jmin = f"document.forms.form1.ContentPlaceHolder1_ddlMinistry.value={ str (self .minis_code )} ;"
49- self .jday = f"document.forms.form1.ContentPlaceHolder1_ddlday.value={ str (self .rel_day ).lstrip ('0' )} ;"
50- self .jmon = f"document.forms.form1.ContentPlaceHolder1_ddlMonth.value={ str (self .rel_month ).lstrip ('0' )} ;"
51- self .submit = f"document.forms.form1.submit()"
52- self .jsub = self .jmin + self .jday + self .jmon + self .jyr + self .submit
53- yield SeleniumRequest (url = url , callback = self .parse_js , script = self .jsub )
40+ self .one = "1|"
41+ self .jyr = f"{ str (self .rel_year ).lstrip ('0' )} |"
42+ self .jmin = f"{ str (self .minis_code )} "
43+ self .jday = f"{ str (self .rel_day ).lstrip ('0' )} |"
44+ self .jmon = f"{ str (self .rel_month ).lstrip ('0' )} |"
45+ self .jsub = self .one + self .jday + self .jmon + self .jyr + self .jmin
46+ pib_data = {"__CALLBACKID" : "__Page" , "__CALLBACKPARAM" : str (self .jsub )}
47+ yield FormRequest .from_response (
48+ response , formdata = pib_data , callback = self .parse_asp
49+ )
5450
55- def parse_js (self , response ):
51+ def parse_asp (self , response ):
5652 # for i in response.xpath("//div[contains(@class,'content-area')]/ul[contains(@class,'num')]"): #response.css("div.content-area ul.num"):
5753 # print(i.xpath("//h3").extract(),i.xpath("//li/a[contains(@href,'PRID')]").extract(),i.xpath("//h3/following-sibling").extract())
58- for articles in response .xpath (
59- "//div[contains(@class,'content-area')]/ul[contains(@class,'num')]/li/a[contains(@href,'PRID')]"
60- ):
61- pib_prid = str (articles .xpath ("@href" ).get ()).split ("=" , 1 )[1 ]
62- pib_title_unnorm = str (articles .xpath ("@title" ).get ())[:90 ]
54+ for articles in response .xpath ("//li[contains(@onclick,'Getrelease')]" ):
55+ pib_prid = str (articles .xpath ("@id" ).get ())
56+ pib_title_unnorm = str (articles .xpath ("text()" ).get ())[:90 ]
6357 pib_title_norm = self .remove_html_entities (pib_title_unnorm )
6458 pib_title_un = (
6559 str (pib_title_norm )
@@ -73,7 +67,7 @@ def parse_js(self, response):
7367 pib_title = pib_title_re + "_" + str (pib_prid ) + ".pdf"
7468
7569 pib_min_unnorm = str (
76- articles .xpath ("..//preceding-sibling::h3 [1]/text()" ).get ()
70+ articles .xpath ("..//preceding-sibling::li [1]/text()" ).get ()
7771 )
7872 pib_min_norm = self .remove_html_entities (pib_min_unnorm )
7973 pib_min_un = (
@@ -85,7 +79,7 @@ def parse_js(self, response):
8579 )
8680 pib_min = re .sub ("[`~!@#$%^&*();:',.+=\" <>|\\ /?\n \t \r ]" , "" , pib_min_un )
8781 pib_prlink = str (pib_url ) + str (pib_prid )
88- # print(self.pib_date,pib_min,pib_title,pib_prlink,sep="\n",end="\n\n\n")
82+ # print(self.pib_date, pib_min, pib_title, pib_prlink, sep="\n", end="\n\n\n")
8983 self .download_article (pib_title , pib_prlink , pib_min , self .pib_date )
9084
9185 def txtfile (self , txtfilepath , art_link ):
@@ -94,7 +88,7 @@ def txtfile(self, txtfilepath, art_link):
9488 txtfilep .touch (exist_ok = True )
9589
9690 if not art_link in txtfilep .read_text ():
97- with open (str (txtfilep ), 'a' ) as tfile :
91+ with open (str (txtfilep ), "a" ) as tfile :
9892 tfile .write (str (art_link ))
9993 tfile .write ("\n " )
10094
@@ -119,7 +113,7 @@ def download_article(self, art_title, art_link, art_min, art_date):
119113 text_date = text_art_date .strftime ("%d_%b_%Y" )
120114 textf_name = "PIB_LINKS_" + str (text_date ) + ".txt"
121115 textf_path = Path (pib_links_path , str (textf_name )).expanduser ()
122-
116+
123117 pdf_path = Path (min_path , art_title ).expanduser ()
124118 self .txtfile (str (textf_path ), str (art_link ))
125119 ops = {
@@ -146,5 +140,3 @@ def remove_html_entities(self, txt):
146140 str_html = html .unescape (str (txt ))
147141 str_normalized = normalize ("NFKD" , str_html )
148142 return str (str_normalized )
149-
150-
0 commit comments