Skip to content

Commit 0a2fc98

Browse files
authored
Merge pull request #1 from nit-in/master
pr for articles prior to 2017
2 parents 5f7daed + 580260a commit 0a2fc98

13 files changed

Lines changed: 124 additions & 125 deletions

File tree

.env

Lines changed: 15 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,22 @@ function pib_iter_dates(){
4141
e_date=$(date -I --date="$2+1day")
4242
while [ "${s_date}" != "${e_date}" ] ;do
4343
echo -e "\nWorking on articles for the day\t:\t${s_date}"
44+
45+
cyear=$(date "+%Y" --date=${s_date}) || echo -e "enter correct date in format YYYY-MM-DD"
46+
47+
if [ ${cyear} -ge 2017 ]; then
48+
sel_spider="pib"
49+
else
50+
sel_spider="pib_archives"
51+
fi
52+
4453
if [ $# -eq 2 ]; then
45-
scrapy crawl pib --nolog -a rel_date="${s_date}"
54+
scrapy crawl --nolog ${sel_spider} -a rel_date="${s_date}" -a rel_mincode="0"
4655
fi
4756

4857
if [ $# -eq 3 ]; then
4958
min_code=$3
50-
scrapy crawl pib_ddmin --nolog -a rel_date="${s_date}" -a rel_mincode=${min_code}
59+
scrapy crawl --nolog ${sel_spider} -a rel_date="${s_date}" -a rel_mincode=${min_code}
5160
fi
5261

5362
s_date=$(date -I --date="$s_date+1day")
@@ -125,22 +134,22 @@ function pib_month(){
125134
function pib_today(){
126135

127136
t_date=$(date +'%Y-%m-%d')
128-
scrapy crawl pib_daily --nolog -a rel_date="$t_date"
137+
scrapy crawl --nolog pib_daily -a rel_date="$t_date"
129138
}
130139

131140
function pib_last_day(){
132141

133142
t_date=$(date +'%Y-%m-%d')
134143
l_date=$(date -I --date="$t_date-1day")
135-
scrapy crawl pib --nolog -a rel_date="$l_date"
144+
scrapy crawl --nolog pib_daily -a rel_date="$l_date"
136145
}
137146

138147
function pib_min(){
139148
idate_str=$1
140149
edate_str=$2
141150
ministry=$3
142-
sdate=$(date -I --date ${idate_str})
143-
edate=$(date -I --date ${edate_str})
151+
sdate=$(date -I --date="${idate_str}")
152+
edate=$(date -I --date="${edate_str}")
144153
if ! [[ ${ministry} =~ ^[0-9]+$ ]];
145154
then
146155
echo -e "\nEnter ministry code"
@@ -546,36 +555,6 @@ function err_handle(){
546555
true
547556
}
548557

549-
function update_chromedriver(){
550-
chrome_driver_bin="selenium/chromedriver"
551-
local_chromedriver_ver=$(./${chrome_driver_bin} --version | grep -Eo [0-9].[0-9].* | cut -f 1 -d .)
552-
echo -e "\nCurrent chromedriver version is ${local_chromedriver_ver}"
553-
local_chrome_ver=$(google-chrome --version | grep -Eo [0-9].[0-9].* | cut -f 1 -d .)
554-
echo -e "\nLocal chrome version is ${local_chrome_ver}"
555-
if [ "${local_chrome_ver}" = "${local_chromedriver_ver}" ]; then
556-
echo -e "\nYou are using compatible chromedriver"
557-
else
558-
echo -e "\nRemoving old chromedriver"
559-
rm -f ${chrome_driver_bin}
560-
online_chrome_ver=$(curl "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${local_chrome_ver}")
561-
echo -e "\nonline_chrome_ver ${online_chrome_ver}"
562-
echo -e "\nDownloading compatible version"
563-
wget -q "https://chromedriver.storage.googleapis.com/${online_chrome_ver}/chromedriver_linux64.zip" || echo -e "\nDownload failed"
564-
unzip -p chromedriver_linux64.zip chromedriver > ${chrome_driver_bin} || echo -e "\nunzipping failed"
565-
rm -f chromedriver_linux64.zip
566-
chmod +x ${chrome_driver_bin}
567-
new_local_chromedriver_ver=$(./${chrome_driver_bin} --version | grep -Eo [0-9].[0-9].*)
568-
echo -e "\nSetup is complete"
569-
echo -e "\nNew chromedriver version is now ${new_local_chromedriver_ver}"
570-
fi
571-
}
572-
573-
574-
function dwngrd_chrome(){
575-
sudo apt remove -y google-chrome-stable
576-
sudo apt install -y ./chromepkg/chrome.deb
577-
578-
}
579558
function list_ministries(){
580559
cat ministries.txt
581560
}

.github/workflows/pib.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ jobs:
4444
- name: run script
4545
run: |
4646
source .env
47-
dwngrd_chrome
4847
pib_month ${{ env.PIB_MONTH }} ${{ env.PIB_YEAR }} || err_handle "10" "Spider failed to run"
4948
5049
- name: make txt files

.github/workflows/pib_daily.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ jobs:
4949
- name: run script
5050
run: |
5151
source .env
52-
dwngrd_chrome
5352
pib_today || err_handle "10" "PIB_TODAY running failed"
5453
pib_last_day || err_handle "10" "PIB_LAST_DAY running failed"
5554

.github/workflows/pib_min.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,6 @@ jobs:
148148
- name: run script
149149
run: |
150150
source .env
151-
dwngrd_chrome
152151
pib_min ${{ inputs.intialDate }} ${{ inputs.lastDate }} ${{ inputs.ministrycode }} || err_handle "10" "Spider failed to run"
153152
154153
- name: make txt files

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,28 @@ pib_last_day
148148
> ```
149149
>
150150
> **For ministry code check ministries.txt file for desired ministry**
151+
## Read this if you are forking the repo
152+
153+
You may have trouble with GitHub tokens when you run the workflows after forking the repo.
154+
Click on your profile icon > Settings > Developer settings > Personal access tokens > Generate new token (classic).
155+
Give the token a name and set its expiration to "No expiration".
156+
Under scopes, select repo, workflow, write:packages and delete:packages,
157+
then generate the token and copy it.
158+
![IMG_20231117_204211.jpg](https://github.com/nit-in/pib/assets/8947263/f4ae732c-db31-4cda-8dbc-caa8f2496bdc)
159+
160+
161+
162+
163+
Then go to the pib repo:
164+
Settings > Secrets and variables > Actions
165+
Here, click "New repository secret".
166+
In the name field, enter PIB (all in caps),
167+
and in the secret field, paste the token you copied.
168+
![IMG_20231117_204101.jpg](https://github.com/nit-in/pib/assets/8947263/df02db15-7238-4f09-b11a-fa86477cdcb3)
169+
170+
171+
172+
151173
152174
153175

chromepkg/chrome.deb

-89.5 MB
Binary file not shown.

pibindia/settings.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
# Configure maximum concurrent requests performed by Scrapy (default: 16)
2323
# CONCURRENT_REQUESTS = 32
24-
CONCURRENT_REQUESTS = 1
24+
CONCURRENT_REQUESTS = 4
2525
DUPEFILTER_DEBUG = True
2626

2727
# Configure a delay for requests for the same website (default: 0)
@@ -58,9 +58,9 @@
5858
# 'pibindia.middlewares.PibindiaDownloaderMiddleware': 543,
5959
# }
6060

61-
DOWNLOADER_MIDDLEWARES = {"scrapy_selenium.SeleniumMiddleware": 800}
62-
SELENIUM_DRIVER_NAME = "chrome"
63-
SELENIUM_DRIVER_ARGUMENTS = ["-headless"]
61+
# DOWNLOADER_MIDDLEWARES = {"scrapy_selenium.SeleniumMiddleware": 800}
62+
# SELENIUM_DRIVER_NAME = "chrome"
63+
# SELENIUM_DRIVER_ARGUMENTS = ["-headless"]
6464

6565
# Enable or disable extensions
6666
# See https://docs.scrapy.org/en/latest/topics/extensions.html

pibindia/spiders/pib.py

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import scrapy
2-
from scrapy_selenium import SeleniumRequest
2+
from scrapy import FormRequest
33
from pathlib import Path
44
import requests
55
from datetime import datetime
@@ -13,24 +13,20 @@
1313
url = "https://pib.gov.in/AllRelease.aspx"
1414
pib_url = "https://pib.gov.in/PressReleaseIframePage.aspx?PRID="
1515
cwd = Path.cwd()
16-
chromedriver = "selenium/chromedriver"
17-
chromedriver_path = Path(cwd, chromedriver).expanduser()
16+
1817
platform_release = str(platform.release())
1918
today = datetime.today()
2019

2120

2221
class PibSpider(scrapy.Spider):
2322
name = "pib"
24-
allowed_domains = ["pib.gov.in"]
25-
26-
custom_settings = {
27-
"DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
28-
"SELENIUM_DRIVER_EXECUTABLE_PATH": str(chromedriver_path),
29-
}
23+
start_urls = ["https://pib.gov.in/AllRelease.aspx"]
3024

31-
def start_requests(self):
25+
def parse(self, response):
3226
# self.rel_date = self.rel_date_fn()
3327
self.strp_date = datetime.strptime(self.rel_date, "%Y-%m-%d")
28+
self.minis_code = self.rel_mincode
29+
3430
if (
3531
self.strp_date.date() == today.date()
3632
and "azure" in platform_release.lower()
@@ -42,15 +38,28 @@ def start_requests(self):
4238
self.rel_month = self.strp_date.strftime("%m")
4339
self.rel_year = self.strp_date.strftime("%Y")
4440
self.pib_date = self.strp_date.strftime("%Y/%b/%d")
45-
self.jyr = f"document.forms.form1.ContentPlaceHolder1_ddlYear.value={str(self.rel_year).lstrip('0')};"
46-
self.jmin = f"document.forms.form1.ContentPlaceHolder1_ddlMinistry.value=0;"
47-
self.jday = f"document.forms.form1.ContentPlaceHolder1_ddlday.value={str(self.rel_day).lstrip('0')};"
48-
self.jmon = f"document.forms.form1.ContentPlaceHolder1_ddlMonth.value={str(self.rel_month).lstrip('0')};"
49-
self.submit = f"document.forms.form1.submit()"
50-
self.jsub = self.jmin + self.jday + self.jmon + self.jyr + self.submit
51-
yield SeleniumRequest(url=url, callback=self.parse_js, script=self.jsub)
52-
53-
def parse_js(self, response):
41+
42+
self.jyr = "ctl00$ContentPlaceHolder1$ddlYear"
43+
self.jyrvalue = str(self.rel_year).lstrip("0")
44+
self.jmin = "ctl00$ContentPlaceHolder1$ddlMinistry"
45+
self.jminvalue = str(self.minis_code)
46+
self.jday = "ctl00$ContentPlaceHolder1$ddlday"
47+
self.jdayvalue = str(self.rel_day).lstrip("0")
48+
self.jmon = "ctl00$ContentPlaceHolder1$ddlMonth"
49+
self.jmonvalue = str(self.rel_month).lstrip("0")
50+
51+
pib_data = {
52+
str(self.jmin): str(self.jminvalue),
53+
str(self.jday): str(self.jdayvalue),
54+
str(self.jmon): str(self.jmonvalue),
55+
str(self.jyr): str(self.jyrvalue),
56+
}
57+
58+
yield FormRequest.from_response(
59+
response, formdata=pib_data, callback=self.parse_asp
60+
)
61+
62+
def parse_asp(self, response):
5463
# for i in response.xpath("//div[contains(@class,'content-area')]/ul[contains(@class,'num')]"): #response.css("div.content-area ul.num"):
5564
# print(i.xpath("//h3").extract(),i.xpath("//li/a[contains(@href,'PRID')]").extract(),i.xpath("//h3/following-sibling").extract())
5665
for articles in response.xpath(
@@ -83,7 +92,7 @@ def parse_js(self, response):
8392
)
8493
pib_min = re.sub("[`~!@#$%^&*();:',.+=\"<>|\\/?\n\t\r ]", "", pib_min_un)
8594
pib_prlink = str(pib_url) + str(pib_prid)
86-
# print(self.pib_date,pib_min,pib_title,pib_prlink,sep="\n",end="\n\n\n")
95+
# print(self.pib_date, pib_min, pib_title, pib_prlink, sep="\n", end="\n\n\n")
8796
self.download_article(pib_title, pib_prlink, pib_min, self.pib_date)
8897

8998
def txtfile(self, txtfilepath, art_link):
@@ -92,7 +101,7 @@ def txtfile(self, txtfilepath, art_link):
92101
txtfilep.touch(exist_ok=True)
93102

94103
if not art_link in txtfilep.read_text():
95-
with open(str(txtfilep), 'a') as tfile:
104+
with open(str(txtfilep), "a") as tfile:
96105
tfile.write(str(art_link))
97106
tfile.write("\n")
98107

@@ -117,7 +126,7 @@ def download_article(self, art_title, art_link, art_min, art_date):
117126
text_date = text_art_date.strftime("%d_%b_%Y")
118127
textf_name = "PIB_LINKS_" + str(text_date) + ".txt"
119128
textf_path = Path(pib_links_path, str(textf_name)).expanduser()
120-
129+
121130
pdf_path = Path(min_path, art_title).expanduser()
122131
self.txtfile(str(textf_path), str(art_link))
123132
ops = {
@@ -144,5 +153,3 @@ def remove_html_entities(self, txt):
144153
str_html = html.unescape(str(txt))
145154
str_normalized = normalize("NFKD", str_html)
146155
return str(str_normalized)
147-
148-
Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import scrapy
2-
from scrapy_selenium import SeleniumRequest
32
from pathlib import Path
43
import requests
54
from datetime import datetime
@@ -8,31 +7,25 @@
87
import platform
98
import html
109
from unicodedata import normalize
10+
from scrapy import FormRequest
1111

1212
# url = 'https://archive.pib.gov.in/archive2/erelease.aspx/'
13-
url = "https://pib.gov.in/AllRelease.aspx"
14-
pib_url = "https://pib.gov.in/PressReleaseIframePage.aspx?PRID="
13+
url = "https://archive.pib.gov.in/archive2/erelease.aspx"
14+
pib_url = "https://archive.pib.gov.in/newsite/PrintRelease.aspx?relid="
1515
cwd = Path.cwd()
16-
chromedriver = "selenium/chromedriver"
17-
chromedriver_path = Path(cwd, chromedriver).expanduser()
1816
platform_release = str(platform.release())
1917
today = datetime.today()
2018

2119

2220
class PibSpider(scrapy.Spider):
23-
name = "pib_ddmin"
24-
allowed_domains = ["pib.gov.in"]
21+
name = "pib_archives"
22+
start_urls = ["https://archive.pib.gov.in/archive2/erelease.aspx"]
2523

26-
custom_settings = {
27-
"DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
28-
"SELENIUM_DRIVER_EXECUTABLE_PATH": str(chromedriver_path),
29-
}
30-
31-
def start_requests(self):
24+
def parse(self, response):
3225
# self.rel_date = self.rel_date_fn()
3326
self.strp_date = datetime.strptime(self.rel_date, "%Y-%m-%d")
3427
self.minis_code = self.rel_mincode
35-
28+
3629
if (
3730
self.strp_date.date() == today.date()
3831
and "azure" in platform_release.lower()
@@ -44,22 +37,23 @@ def start_requests(self):
4437
self.rel_month = self.strp_date.strftime("%m")
4538
self.rel_year = self.strp_date.strftime("%Y")
4639
self.pib_date = self.strp_date.strftime("%Y/%b/%d")
47-
self.jyr = f"document.forms.form1.ContentPlaceHolder1_ddlYear.value={str(self.rel_year).lstrip('0')};"
48-
self.jmin = f"document.forms.form1.ContentPlaceHolder1_ddlMinistry.value={str(self.minis_code)};"
49-
self.jday = f"document.forms.form1.ContentPlaceHolder1_ddlday.value={str(self.rel_day).lstrip('0')};"
50-
self.jmon = f"document.forms.form1.ContentPlaceHolder1_ddlMonth.value={str(self.rel_month).lstrip('0')};"
51-
self.submit = f"document.forms.form1.submit()"
52-
self.jsub = self.jmin + self.jday + self.jmon + self.jyr + self.submit
53-
yield SeleniumRequest(url=url, callback=self.parse_js, script=self.jsub)
40+
self.one = "1|"
41+
self.jyr = f"{str(self.rel_year).lstrip('0')}|"
42+
self.jmin = f"{str(self.minis_code)}"
43+
self.jday = f"{str(self.rel_day).lstrip('0')}|"
44+
self.jmon = f"{str(self.rel_month).lstrip('0')}|"
45+
self.jsub = self.one + self.jday + self.jmon + self.jyr + self.jmin
46+
pib_data = {"__CALLBACKID": "__Page", "__CALLBACKPARAM": str(self.jsub)}
47+
yield FormRequest.from_response(
48+
response, formdata=pib_data, callback=self.parse_asp
49+
)
5450

55-
def parse_js(self, response):
51+
def parse_asp(self, response):
5652
# for i in response.xpath("//div[contains(@class,'content-area')]/ul[contains(@class,'num')]"): #response.css("div.content-area ul.num"):
5753
# print(i.xpath("//h3").extract(),i.xpath("//li/a[contains(@href,'PRID')]").extract(),i.xpath("//h3/following-sibling").extract())
58-
for articles in response.xpath(
59-
"//div[contains(@class,'content-area')]/ul[contains(@class,'num')]/li/a[contains(@href,'PRID')]"
60-
):
61-
pib_prid = str(articles.xpath("@href").get()).split("=", 1)[1]
62-
pib_title_unnorm = str(articles.xpath("@title").get())[:90]
54+
for articles in response.xpath("//li[contains(@onclick,'Getrelease')]"):
55+
pib_prid = str(articles.xpath("@id").get())
56+
pib_title_unnorm = str(articles.xpath("text()").get())[:90]
6357
pib_title_norm = self.remove_html_entities(pib_title_unnorm)
6458
pib_title_un = (
6559
str(pib_title_norm)
@@ -73,7 +67,7 @@ def parse_js(self, response):
7367
pib_title = pib_title_re + "_" + str(pib_prid) + ".pdf"
7468

7569
pib_min_unnorm = str(
76-
articles.xpath("..//preceding-sibling::h3[1]/text()").get()
70+
articles.xpath("..//preceding-sibling::li[1]/text()").get()
7771
)
7872
pib_min_norm = self.remove_html_entities(pib_min_unnorm)
7973
pib_min_un = (
@@ -85,7 +79,7 @@ def parse_js(self, response):
8579
)
8680
pib_min = re.sub("[`~!@#$%^&*();:',.+=\"<>|\\/?\n\t\r ]", "", pib_min_un)
8781
pib_prlink = str(pib_url) + str(pib_prid)
88-
# print(self.pib_date,pib_min,pib_title,pib_prlink,sep="\n",end="\n\n\n")
82+
# print(self.pib_date, pib_min, pib_title, pib_prlink, sep="\n", end="\n\n\n")
8983
self.download_article(pib_title, pib_prlink, pib_min, self.pib_date)
9084

9185
def txtfile(self, txtfilepath, art_link):
@@ -94,7 +88,7 @@ def txtfile(self, txtfilepath, art_link):
9488
txtfilep.touch(exist_ok=True)
9589

9690
if not art_link in txtfilep.read_text():
97-
with open(str(txtfilep), 'a') as tfile:
91+
with open(str(txtfilep), "a") as tfile:
9892
tfile.write(str(art_link))
9993
tfile.write("\n")
10094

@@ -119,7 +113,7 @@ def download_article(self, art_title, art_link, art_min, art_date):
119113
text_date = text_art_date.strftime("%d_%b_%Y")
120114
textf_name = "PIB_LINKS_" + str(text_date) + ".txt"
121115
textf_path = Path(pib_links_path, str(textf_name)).expanduser()
122-
116+
123117
pdf_path = Path(min_path, art_title).expanduser()
124118
self.txtfile(str(textf_path), str(art_link))
125119
ops = {
@@ -146,5 +140,3 @@ def remove_html_entities(self, txt):
146140
str_html = html.unescape(str(txt))
147141
str_normalized = normalize("NFKD", str_html)
148142
return str(str_normalized)
149-
150-

0 commit comments

Comments
 (0)