Skip to content

Commit cf4850d

Browse files
committed
added readme file and validated flake8
1 parent fc1980c commit cf4850d

File tree

2 files changed

+112
-0
lines changed

2 files changed

+112
-0
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
Scrape News From HackerNews website
2+
3+
A script that scrapes a number of pages from HackerNews
4+
5+
6+
### How to run the script
7+
In the command line, navigate to the file's directory and run "python main.py".
8+
9+
10+
## *Author Name*
11+
Developed by Javokhir
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import requests
2+
import os
3+
from multiprocessing import Pool, cpu_count
4+
from functools import partial
5+
from bs4 import BeautifulSoup, SoupStrainer
6+
# Create the output directory for scraped pages. exist_ok=True makes this
# idempotent and avoids the check-then-create race of the original
# `if not os.path.exists(...)` guard.
os.makedirs(os.path.join(os.getcwd(), 'HackerNews'), exist_ok=True)
9+
def fetch(page_no, verbose=False):
    """Fetch one HackerNews page and write its stories to a text file.

    Output goes to ``HackerNews/NewsPage<page_no>.txt``. Taking only a
    single page number (rather than a list) keeps the signature ready for
    future multiprocessing support.

    Args:
        page_no: Page number of HackerNews to fetch (clamped to 20,
            which is the maximum HackerNews serves).
        verbose: Adds verbose output to screen instead of running the
            program silently.

    Raises:
        ValueError: If page_no is zero or negative.
    """
    # Should be unreachable when called from the prompt loop, but just in case.
    if page_no <= 0:
        raise ValueError('Number of Pages must be greater than zero')
    i = min(page_no, 20)
    if verbose:
        print('Fetching Page {}...'.format(i))
    try:
        res = requests.get('https://news.ycombinator.com/?p=' + str(i))
        # Everything we need lives inside <td> cells; parsing only those
        # keeps the soup small.
        only_td = SoupStrainer('td')
        soup = BeautifulSoup(res.content, 'html.parser', parse_only=only_td)
        path = os.path.join('HackerNews', 'NewsPage{}.txt'.format(i))
        with open(path, 'w+') as f:
            f.write('-' * 80)
            f.write('\n')
            f.write('Page {}'.format(i))
            tdtitle = soup.find_all('td', attrs={'class': 'title'})
            # Rank cells share the 'title' class but are right-aligned, so
            # subtract them to get the title-only cells.
            tdrank = soup.find_all(
                'td', attrs={'class': 'title', 'align': 'right'})
            tdtitleonly = [t for t in tdtitle if t not in tdrank]
            tdmetrics = soup.find_all('td', attrs={'class': 'subtext'})
            num_iter = min(len(tdrank), len(tdtitleonly))
            for idx in range(num_iter):
                f.write('\n' + '-' * 80 + '\n')
                rank = tdrank[idx].find('span', attrs={'class': 'rank'})
                titl = tdtitleonly[idx].find('a', attrs={'class': 'storylink'})
                # BUG FIX: the original evaluated titl['href'] in the
                # fallback branch even when titl was None, raising
                # TypeError. Treat a missing anchor as "no URL".
                if titl is None:
                    url = None
                elif titl['href'].startswith('https'):
                    url = titl['href']
                else:
                    url = 'https://news.ycombinator.com/' + titl['href']
                site = tdtitleonly[idx].find('span', attrs={'class': 'sitestr'})
                score = tdmetrics[idx].find('span', attrs={'class': 'score'})
                time = tdmetrics[idx].find('span', attrs={'class': 'age'})
                author = tdmetrics[idx].find('a', attrs={'class': 'hnuser'})
                f.write(('\nArticle Number: ' + rank.text.replace('.', ''))
                        if rank else
                        '\nArticle Number: Could not get article number')
                f.write(('\nArticle Title: ' + titl.text)
                        if titl else
                        '\nArticle Title: Could not get article title')
                f.write(('\nSource Website: ' + site.text)
                        if site else
                        '\nSource Website: https://news.ycombinator.com')
                f.write(('\nSource URL: ' + url)
                        if url else
                        '\nSource URL: No URL found for this article')
                f.write(('\nArticle Author: ' + author.text)
                        if author else
                        '\nArticle Author: Could not get article author')
                f.write(('\nArticle Score: ' + score.text)
                        if score else
                        '\nArticle Score: Not Scored')
                f.write(('\nPosted: ' + time.text)
                        if time else
                        '\nPosted: Could not find when the article was posted')
                f.write('\n' + '-' * 80 + '\n')
    # requests.ConnectionError already wraps urllib3's connection errors;
    # the original also named requests.packages.urllib3.exceptions.
    # ConnectionError, an access path that no longer exists in modern
    # urllib3 and would itself raise AttributeError.
    except requests.ConnectionError:
        print('Connection Failed for page {}'.format(i))
    except requests.RequestException as e:
        print("Some ambiguous Request Exception occurred. The exception is " + str(e))
85+
86+
87+
# Interactive entry point: prompt for a page count and verbosity flag,
# then fetch each page in order. Loops until valid input is given.
while True:
    try:
        pages = int(
            input('Enter number of pages that you want the HackerNews for (max 20): '))
        if pages <= 0:
            # Re-prompt: the original silently fetched nothing for zero
            # or negative counts. Message matches the except handler below.
            raise ValueError('pages must be a positive integer')
        v = input('Want verbose output y/[n] ?')
        verbose = v.lower().startswith('y')
        if pages > 20:
            print('A maximum of only 20 pages can be fetched')
            pages = min(pages, 20)
        for page_no in range(1, pages + 1):
            fetch(page_no, verbose)
        break
    except ValueError:
        print('\nInvalid input, probably not a positive integer\n')

0 commit comments

Comments
 (0)