revendrat
diff --git a/‎.env‎
Lines changed: 15 additions & 36 deletions b/‎.env‎
Lines changed: 15 additions & 36 deletions
diff --git a/‎.github/workflows/pib.yml‎
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/pib.yml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎.github/workflows/pib_daily.yml‎
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/pib_daily.yml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎.github/workflows/pib_min.yml‎
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/pib_min.yml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 22 additions & 0 deletions b/‎README.md‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎chromepkg/chrome.deb‎
-89.5 MB b/‎chromepkg/chrome.deb‎
-89.5 MB
diff --git a/‎pibindia/settings.py‎
Lines changed: 4 additions & 4 deletions b/‎pibindia/settings.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎pibindia/spiders/pib.py‎
Lines changed: 31 additions & 24 deletions b/‎pibindia/spiders/pib.py‎
Lines changed: 31 additions & 24 deletions
diff --git a/‎pibindia/spiders/pib_ddmin.py‎ ‎pibindia/spiders/pib_archives.py‎pibindia/spiders/pib_ddmin.py renamed to pibindia/spiders/pib_archives.py
Lines changed: 25 additions & 33 deletions b/‎pibindia/spiders/pib_ddmin.py‎ ‎pibindia/spiders/pib_archives.py‎pibindia/spiders/pib_ddmin.py renamed to pibindia/spiders/pib_archives.py
Lines changed: 25 additions & 33 deletions
@@ -41,13 +41,22 @@ function pib_iter_dates(){
  e_date=$(date -I --date="$2+1day")
   	while [ "${s_date}" != "${e_date}" ] ;do
   		echo -e "\nWorking on articles for the day\t:\t${s_date}"
+ 
+ cyear=$(date "+%Y" --date=${s_date}) || echo -e "enter correct date in format YYYY-MM-DD"
+
+  		if [ ${cyear} -ge 2017 ]; then
+ sel_spider="pib"
+        else
+ sel_spider="pib_archives"
+        fi
+ 
   		if [ $# -eq 2 ]; then
-  		  scrapy crawl pib --nolog -a rel_date="${s_date}"
+  		  scrapy crawl --nolog ${sel_spider} -a rel_date="${s_date}" -a rel_mincode="0"
   		fi
 
   		if [ $# -eq 3 ]; then
  min_code=$3
-  		  scrapy crawl pib_ddmin --nolog -a rel_date="${s_date}" -a rel_mincode=${min_code}
+  		  scrapy crawl --nolog ${sel_spider} -a rel_date="${s_date}" -a rel_mincode=${min_code}
   		fi
 
  s_date=$(date -I --date="$s_date+1day")
@@ -125,22 +134,22 @@ function pib_month(){
 function pib_today(){
 
  t_date=$(date +'%Y-%m-%d')
-	scrapy crawl pib_daily --nolog -a rel_date="$t_date"
+	scrapy crawl --nolog pib_daily -a rel_date="$t_date"
 }
 
 function pib_last_day(){
 
  t_date=$(date +'%Y-%m-%d')
  l_date=$(date -I --date="$t_date-1day")
-	scrapy crawl pib --nolog -a rel_date="$l_date"
+	scrapy crawl --nolog pib_daily -a rel_date="$l_date"
  }
 
 function pib_min(){
  idate_str=$1
  edate_str=$2
  ministry=$3
- sdate=$(date -I --date ${idate_str})
- edate=$(date -I --date ${edate_str})
+ sdate=$(date -I --date="${idate_str}")
+ edate=$(date -I --date="${edate_str}")
   if ! [[ ${ministry} =~ ^[0-9]+$ ]];
   then
       echo -e "\nEnter ministry code"
@@ -546,36 +555,6 @@ function err_handle(){
 	true
 }
 
-function update_chromedriver(){
- chrome_driver_bin="selenium/chromedriver"
- local_chromedriver_ver=$(./${chrome_driver_bin} --version | grep -Eo [0-9].[0-9].* | cut -f 1 -d .)
-	echo -e "\nCurrent chromedriver version is ${local_chromedriver_ver}"
- local_chrome_ver=$(google-chrome --version | grep -Eo [0-9].[0-9].* | cut -f 1 -d .)
-        echo -e "\nLocal chrome version is ${local_chrome_ver}"
-	if [ "${local_chrome_ver}" = "${local_chromedriver_ver}" ]; then
-		echo -e "\nYou are using compatible chromedriver"
-	else
-		echo -e "\nRemoving old chromedriver"
-		rm -f ${chrome_driver_bin}
- online_chrome_ver=$(curl "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${local_chrome_ver}")
-		echo -e "\nonline_chrome_ver ${online_chrome_ver}"
-                echo -e "\nDownloading compatible version"
-		wget -q "https://chromedriver.storage.googleapis.com/${online_chrome_ver}/chromedriver_linux64.zip" || echo -e "\nDownload failed"
-		unzip -p chromedriver_linux64.zip chromedriver > ${chrome_driver_bin} || echo -e "\nunzipping failed"
-		rm -f chromedriver_linux64.zip
-		chmod +x ${chrome_driver_bin}
- new_local_chromedriver_ver=$(./${chrome_driver_bin} --version | grep -Eo [0-9].[0-9].*)
-		echo -e "\nSetup is complete"
-		echo -e "\nNew chromedriver version is now ${new_local_chromedriver_ver}"
-	fi
-}
-
-
-function dwngrd_chrome(){
-sudo apt remove -y google-chrome-stable
-sudo apt install -y ./chromepkg/chrome.deb
-
-}
 function list_ministries(){
   cat ministries.txt
 }
@@ -44,7 +44,6 @@ jobs:
       - name: run script
  run:  |
           source .env
-          dwngrd_chrome
           pib_month ${{ env.PIB_MONTH }} ${{ env.PIB_YEAR }} || err_handle "10" "Spider failed to run"
 
       - name: make txt files
 
@@ -49,7 +49,6 @@ jobs:
       - name: run script
  run:  |
           source .env
-          dwngrd_chrome
           pib_today || err_handle "10" "PIB_TODAY running failed"
           pib_last_day || err_handle "10" "PIB_LAST_DAY running failed"
 
 
@@ -148,7 +148,6 @@ jobs:
       - name: run script
  run:  |
           source .env
-          dwngrd_chrome
           pib_min ${{ inputs.intialDate }} ${{ inputs.lastDate }} ${{ inputs.ministrycode }} || err_handle "10" "Spider failed to run"
 
       - name: make txt files
 
@@ -148,6 +148,28 @@ pib_last_day
 > ```
 > 
 > **For ministry code check ministries.txt file for desired ministry**
+## Read this if you are forking the repo
+
+If you fork this repo, you may run into trouble with GitHub tokens when running the workflows, so create a personal access token first.
+Click your profile icon > Settings > Developer settings > Personal access tokens > Generate new token (classic).
+Give the token a name and set its expiration to "No expiration".
+Under scopes, select `repo`, `workflow`, `write:packages` and `delete:packages`.
+Then generate the token and copy it.
+![IMG_20231117_204211.jpg](https://github.com/nit-in/pib/assets/8947263/f4ae732c-db31-4cda-8dbc-caa8f2496bdc)
+
+
+
+
+Then go to your forked pib repo.
+Open Settings > Secrets and variables > Actions.
+Click "New repository secret".
+In the name field, enter PIB (all in caps).
+In the secret field, paste the token you copied.
+![IMG_20231117_204101.jpg](https://github.com/nit-in/pib/assets/8947263/df02db15-7238-4f09-b11a-fa86477cdcb3)
+
+
+
+
 
 
 
 
@@ -21,7 +21,7 @@
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 # CONCURRENT_REQUESTS = 32
-CONCURRENT_REQUESTS = 1
+CONCURRENT_REQUESTS = 4
 DUPEFILTER_DEBUG = True
 
 # Configure a delay for requests for the same website (default: 0)
@@ -58,9 +58,9 @@
 #    'pibindia.middlewares.PibindiaDownloaderMiddleware': 543,
 # }
 
-DOWNLOADER_MIDDLEWARES = {"scrapy_selenium.SeleniumMiddleware": 800}
-SELENIUM_DRIVER_NAME = "chrome"
-SELENIUM_DRIVER_ARGUMENTS = ["-headless"]
+# DOWNLOADER_MIDDLEWARES = {"scrapy_selenium.SeleniumMiddleware": 800}
+# SELENIUM_DRIVER_NAME = "chrome"
+# SELENIUM_DRIVER_ARGUMENTS = ["-headless"]
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
 
@@ -1,5 +1,5 @@
 import scrapy
-from scrapy_selenium import SeleniumRequest
+from scrapy import FormRequest
 from pathlib import Path
 import requests
 from datetime import datetime
@@ -13,24 +13,20 @@
 url = "https://pib.gov.in/AllRelease.aspx"
 pib_url = "https://pib.gov.in/PressReleaseIframePage.aspx?PRID="
 cwd = Path.cwd()
-chromedriver = "selenium/chromedriver"
-chromedriver_path = Path(cwd, chromedriver).expanduser()
+
 platform_release = str(platform.release())
 today = datetime.today()
 
 
 class PibSpider(scrapy.Spider):
  name = "pib"
- allowed_domains = ["pib.gov.in"]
-
- custom_settings = {
- "DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
- "SELENIUM_DRIVER_EXECUTABLE_PATH": str(chromedriver_path),
-    }
+ start_urls = ["https://pib.gov.in/AllRelease.aspx"]
 
- def start_requests(self):
+ def parse(self, response):
  # self.rel_date = self.rel_date_fn()
  self.strp_date = datetime.strptime(self.rel_date, "%Y-%m-%d")
+ self.minis_code = self.rel_mincode
+
  if (
  self.strp_date.date() == today.date()
  and "azure" in platform_release.lower()
@@ -42,15 +38,28 @@ def start_requests(self):
  self.rel_month = self.strp_date.strftime("%m")
  self.rel_year = self.strp_date.strftime("%Y")
  self.pib_date = self.strp_date.strftime("%Y/%b/%d")
- self.jyr = f"document.forms.form1.ContentPlaceHolder1_ddlYear.value={str(self.rel_year).lstrip('0')};"
- self.jmin = f"document.forms.form1.ContentPlaceHolder1_ddlMinistry.value=0;"
- self.jday = f"document.forms.form1.ContentPlaceHolder1_ddlday.value={str(self.rel_day).lstrip('0')};"
- self.jmon = f"document.forms.form1.ContentPlaceHolder1_ddlMonth.value={str(self.rel_month).lstrip('0')};"
- self.submit = f"document.forms.form1.submit()"
- self.jsub = self.jmin + self.jday + self.jmon + self.jyr + self.submit
- yield SeleniumRequest(url=url, callback=self.parse_js, script=self.jsub)
-
- def parse_js(self, response):
+
+ self.jyr = "ctl00$ContentPlaceHolder1$ddlYear"
+ self.jyrvalue = str(self.rel_year).lstrip("0")
+ self.jmin = "ctl00$ContentPlaceHolder1$ddlMinistry"
+ self.jminvalue = str(self.minis_code)
+ self.jday = "ctl00$ContentPlaceHolder1$ddlday"
+ self.jdayvalue = str(self.rel_day).lstrip("0")
+ self.jmon = "ctl00$ContentPlaceHolder1$ddlMonth"
+ self.jmonvalue = str(self.rel_month).lstrip("0")
+
+ pib_data = {
+ str(self.jmin): str(self.jminvalue),
+ str(self.jday): str(self.jdayvalue),
+ str(self.jmon): str(self.jmonvalue),
+ str(self.jyr): str(self.jyrvalue),
+            }
+
+ yield FormRequest.from_response(
+ response, formdata=pib_data, callback=self.parse_asp
+            )
+
+ def parse_asp(self, response):
  # for i in response.xpath("//div[contains(@class,'content-area')]/ul[contains(@class,'num')]"): #response.css("div.content-area ul.num"):
  # 	print(i.xpath("//h3").extract(),i.xpath("//li/a[contains(@href,'PRID')]").extract(),i.xpath("//h3/following-sibling").extract())
  for articles in response.xpath(
@@ -83,7 +92,7 @@ def parse_js(self, response):
             )
  pib_min = re.sub("[`~!@#$%^&*();:',.+=\"<>|\\/?\n\t\r ]", "", pib_min_un)
  pib_prlink = str(pib_url) + str(pib_prid)
- # print(self.pib_date,pib_min,pib_title,pib_prlink,sep="\n",end="\n\n\n")
+ #  print(self.pib_date, pib_min, pib_title, pib_prlink, sep="\n", end="\n\n\n")
  self.download_article(pib_title, pib_prlink, pib_min, self.pib_date)
 
  def txtfile(self, txtfilepath, art_link):
@@ -92,7 +101,7 @@ def txtfile(self, txtfilepath, art_link):
  txtfilep.touch(exist_ok=True)
 
  if not art_link in txtfilep.read_text():
- with open(str(txtfilep), 'a') as tfile:
+ with open(str(txtfilep), "a") as tfile:
  tfile.write(str(art_link))
  tfile.write("\n")
 
@@ -117,7 +126,7 @@ def download_article(self, art_title, art_link, art_min, art_date):
  text_date = text_art_date.strftime("%d_%b_%Y")
  textf_name = "PIB_LINKS_" + str(text_date) + ".txt"
  textf_path = Path(pib_links_path, str(textf_name)).expanduser()
- 
+
  pdf_path = Path(min_path, art_title).expanduser()
  self.txtfile(str(textf_path), str(art_link))
  ops = {
@@ -144,5 +153,3 @@ def remove_html_entities(self, txt):
  str_html = html.unescape(str(txt))
  str_normalized = normalize("NFKD", str_html)
  return str(str_normalized)
-
-
 
@@ -1,5 +1,4 @@
 import scrapy
-from scrapy_selenium import SeleniumRequest
 from pathlib import Path
 import requests
 from datetime import datetime
@@ -8,31 +7,25 @@
 import platform
 import html
 from unicodedata import normalize
+from scrapy import FormRequest
 
 # url = 'https://archive.pib.gov.in/archive2/erelease.aspx/'
-url = "https://pib.gov.in/AllRelease.aspx"
-pib_url = "https://pib.gov.in/PressReleaseIframePage.aspx?PRID="
+url = "https://archive.pib.gov.in/archive2/erelease.aspx"
+pib_url = "https://archive.pib.gov.in/newsite/PrintRelease.aspx?relid="
 cwd = Path.cwd()
-chromedriver = "selenium/chromedriver"
-chromedriver_path = Path(cwd, chromedriver).expanduser()
 platform_release = str(platform.release())
 today = datetime.today()
 
 
 class PibSpider(scrapy.Spider):
- name = "pib_ddmin"
- allowed_domains = ["pib.gov.in"]
+ name = "pib_archives"
+ start_urls = ["https://archive.pib.gov.in/archive2/erelease.aspx"]
 
- custom_settings = {
- "DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
- "SELENIUM_DRIVER_EXECUTABLE_PATH": str(chromedriver_path),
-    }
-
- def start_requests(self):
+ def parse(self, response):
  # self.rel_date = self.rel_date_fn()
  self.strp_date = datetime.strptime(self.rel_date, "%Y-%m-%d")
  self.minis_code = self.rel_mincode
- 
+
  if (
  self.strp_date.date() == today.date()
  and "azure" in platform_release.lower()
@@ -44,22 +37,23 @@ def start_requests(self):
  self.rel_month = self.strp_date.strftime("%m")
  self.rel_year = self.strp_date.strftime("%Y")
  self.pib_date = self.strp_date.strftime("%Y/%b/%d")
- self.jyr = f"document.forms.form1.ContentPlaceHolder1_ddlYear.value={str(self.rel_year).lstrip('0')};"
- self.jmin = f"document.forms.form1.ContentPlaceHolder1_ddlMinistry.value={str(self.minis_code)};"
- self.jday = f"document.forms.form1.ContentPlaceHolder1_ddlday.value={str(self.rel_day).lstrip('0')};"
- self.jmon = f"document.forms.form1.ContentPlaceHolder1_ddlMonth.value={str(self.rel_month).lstrip('0')};"
- self.submit = f"document.forms.form1.submit()"
- self.jsub = self.jmin + self.jday + self.jmon + self.jyr + self.submit
- yield SeleniumRequest(url=url, callback=self.parse_js, script=self.jsub)
+ self.one = "1|"
+ self.jyr = f"{str(self.rel_year).lstrip('0')}|"
+ self.jmin = f"{str(self.minis_code)}"
+ self.jday = f"{str(self.rel_day).lstrip('0')}|"
+ self.jmon = f"{str(self.rel_month).lstrip('0')}|"
+ self.jsub = self.one + self.jday + self.jmon + self.jyr + self.jmin
+ pib_data = {"__CALLBACKID": "__Page", "__CALLBACKPARAM": str(self.jsub)}
+ yield FormRequest.from_response(
+ response, formdata=pib_data, callback=self.parse_asp
+            )
 
- def parse_js(self, response):
+ def parse_asp(self, response):
  # for i in response.xpath("//div[contains(@class,'content-area')]/ul[contains(@class,'num')]"): #response.css("div.content-area ul.num"):
  # 	print(i.xpath("//h3").extract(),i.xpath("//li/a[contains(@href,'PRID')]").extract(),i.xpath("//h3/following-sibling").extract())
- for articles in response.xpath(
- "//div[contains(@class,'content-area')]/ul[contains(@class,'num')]/li/a[contains(@href,'PRID')]"
-        ):
- pib_prid = str(articles.xpath("@href").get()).split("=", 1)[1]
- pib_title_unnorm = str(articles.xpath("@title").get())[:90]
+ for articles in response.xpath("//li[contains(@onclick,'Getrelease')]"):
+ pib_prid = str(articles.xpath("@id").get())
+ pib_title_unnorm = str(articles.xpath("text()").get())[:90]
  pib_title_norm = self.remove_html_entities(pib_title_unnorm)
  pib_title_un = (
  str(pib_title_norm)
@@ -73,7 +67,7 @@ def parse_js(self, response):
  pib_title = pib_title_re + "_" + str(pib_prid) + ".pdf"
 
  pib_min_unnorm = str(
- articles.xpath("..//preceding-sibling::h3[1]/text()").get()
+ articles.xpath("..//preceding-sibling::li[1]/text()").get()
             )
  pib_min_norm = self.remove_html_entities(pib_min_unnorm)
  pib_min_un = (
@@ -85,7 +79,7 @@ def parse_js(self, response):
             )
  pib_min = re.sub("[`~!@#$%^&*();:',.+=\"<>|\\/?\n\t\r ]", "", pib_min_un)
  pib_prlink = str(pib_url) + str(pib_prid)
- # print(self.pib_date,pib_min,pib_title,pib_prlink,sep="\n",end="\n\n\n")
+ #  print(self.pib_date, pib_min, pib_title, pib_prlink, sep="\n", end="\n\n\n")
  self.download_article(pib_title, pib_prlink, pib_min, self.pib_date)
 
  def txtfile(self, txtfilepath, art_link):
@@ -94,7 +88,7 @@ def txtfile(self, txtfilepath, art_link):
  txtfilep.touch(exist_ok=True)
 
  if not art_link in txtfilep.read_text():
- with open(str(txtfilep), 'a') as tfile:
+ with open(str(txtfilep), "a") as tfile:
  tfile.write(str(art_link))
  tfile.write("\n")
 
@@ -119,7 +113,7 @@ def download_article(self, art_title, art_link, art_min, art_date):
  text_date = text_art_date.strftime("%d_%b_%Y")
  textf_name = "PIB_LINKS_" + str(text_date) + ".txt"
  textf_path = Path(pib_links_path, str(textf_name)).expanduser()
- 
+
  pdf_path = Path(min_path, art_title).expanduser()
  self.txtfile(str(textf_path), str(art_link))
  ops = {
@@ -146,5 +140,3 @@ def remove_html_entities(self, txt):
  str_html = html.unescape(str(txt))
  str_normalized = normalize("NFKD", str_html)
  return str(str_normalized)
-
-