|
| 1 | +import requests |
| 2 | +import os |
| 3 | +from multiprocessing import Pool, cpu_count |
| 4 | +from functools import partial |
| 5 | +from bs4 import BeautifulSoup, SoupStrainer |
# Make the output directory if it does not exist.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test (another process could create the directory
# between the check and the makedirs call).
os.makedirs(os.path.join(os.getcwd(), 'HackerNews'), exist_ok=True)
'''
@params page_no: The page number of HackerNews to fetch.
Only the page number is taken so that multiprocessing support
can be added in the future.
@params verbose: Adds verbose output to the screen instead
of running the program silently.
'''
| 15 | + |
| 16 | + |
def fetch(page_no, verbose=False):
    '''
    Fetch one page of HackerNews and write its stories to
    HackerNews/NewsPage{page_no}.txt.

    @params page_no: The page number of HackerNews to fetch (clamped to 20,
    the maximum HackerNews serves).
    @params verbose: Adds verbose output to screen instead
    of running the program silently.
    Raises ValueError when page_no is not positive.
    Network failures are reported to stdout, not raised.
    '''
    # Should be unreachable from the interactive driver, but just in case.
    if page_no <= 0:
        raise ValueError('Number of Pages must be greater than zero')
    page_no = min(page_no, 20)
    if verbose:
        print('Fetching Page {}...'.format(page_no))
    try:
        res = requests.get('https://news.ycombinator.com/?p=' + str(page_no))
        # Restrict parsing to <td> tags: every field we extract lives in one.
        only_td = SoupStrainer('td')
        soup = BeautifulSoup(res.content, 'html.parser', parse_only=only_td)
        tdtitle = soup.find_all('td', attrs={'class': 'title'})
        # Rank cells also carry class 'title' but are right-aligned;
        # subtract them to keep only the actual title cells.
        tdrank = soup.find_all('td', attrs={'class': 'title', 'align': 'right'})
        tdtitleonly = [t for t in tdtitle if t not in tdrank]
        tdmetrics = soup.find_all('td', attrs={'class': 'subtext'})
        # Iterate only as far as every parallel list allows; the original
        # ignored len(tdmetrics) and could IndexError on a short subtext list.
        num_iter = min(len(tdrank), len(tdtitleonly), len(tdmetrics))
        out_path = os.path.join('HackerNews', 'NewsPage{}.txt'.format(page_no))
        with open(out_path, 'w+') as f:
            f.write('-' * 80)
            f.write('\n')
            f.write('Page {}'.format(page_no))
            for idx in range(num_iter):
                f.write('\n' + '-' * 80 + '\n')
                rank = tdrank[idx].find('span', attrs={'class': 'rank'})
                titl = tdtitleonly[idx].find('a', attrs={'class': 'storylink'})
                site = tdtitleonly[idx].find('span', attrs={'class': 'sitestr'})
                score = tdmetrics[idx].find('span', attrs={'class': 'score'})
                time = tdmetrics[idx].find('span', attrs={'class': 'age'})
                author = tdmetrics[idx].find('a', attrs={'class': 'hnuser'})
                # Absolute links (http or https) are kept as-is; HN-internal
                # links are relative and need the site prefix.  Guard against
                # a missing anchor: the original subscripted titl['href'] in
                # the else branch and crashed with TypeError when titl was
                # None, and its startswith('https') test mis-prefixed plain
                # http:// external links with the HN domain.
                href = titl.get('href') if titl else None
                if href is None:
                    url = None
                elif href.startswith(('http://', 'https://')):
                    url = href
                else:
                    url = 'https://news.ycombinator.com/' + href
                f.write('\nArticle Number: ' + rank.text.replace('.', '')
                        if rank else '\nArticle Number: Could not get article number')
                f.write('\nArticle Title: ' + titl.text
                        if titl else '\nArticle Title: Could not get article title')
                f.write('\nSource Website: ' + site.text
                        if site else '\nSource Website: https://news.ycombinator.com')
                f.write('\nSource URL: ' + url
                        if url else '\nSource URL: No URL found for this article')
                f.write('\nArticle Author: ' + author.text
                        if author else '\nArticle Author: Could not get article author')
                f.write('\nArticle Score: ' + score.text
                        if score else '\nArticle Score: Not Scored')
                f.write('\nPosted: ' + time.text
                        if time else '\nPosted: Could not find when the article was posted')
                f.write('\n' + '-' * 80 + '\n')
    # requests.ConnectionError already covers the urllib3 ConnectionError
    # the original caught via the fragile private path
    # requests.packages.urllib3.exceptions.ConnectionError.
    except requests.ConnectionError:
        print('Connection Failed for page {}'.format(page_no))
    except requests.RequestException as e:
        print("Some ambiguous Request Exception occurred. The exception is " + str(e))
| 85 | + |
| 86 | + |
# Interactive driver: prompt until a valid positive integer is supplied,
# then fetch pages 1..pages sequentially.
while True:
    try:
        pages = int(
            input('Enter number of pages that you want the HackerNews for (max 20): '))
        # The original silently accepted 0 or a negative number (empty
        # range, nothing fetched); re-prompt instead, matching the
        # ValueError path below.
        if pages <= 0:
            print('\nInvalid input, probably not a positive integer\n')
            continue
        v = input('Want verbose output y/[n] ?')
        verbose = v.lower().startswith('y')
        if pages > 20:
            print('A maximum of only 20 pages can be fetched')
            pages = 20
        for page_no in range(1, pages + 1):
            fetch(page_no, verbose)
        break
    except ValueError:
        print('\nInvalid input, probably not a positive integer\n')
0 commit comments