Best Pizza in Boston

Purpose: Kind of nerdy, but I’ve always loved Master Splinter and his obsession with pizza. I spent a weekend in Boston and asked around for the best place, with mixed results, so I ended up writing a script that scrapes Yelp for each restaurant’s name, number of reviews, and average rating. I then computed a Bayesian average of the ratings to normalize the values and reduce the noise from restaurants with only a few reviews. Beautiful city.

import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

  # Configuration: scrape proxy, target search URL, paging and ranking constants.

# Scrapestack access key. It can be overridden with the SCRAPESTACK_ACCESS_KEY
# environment variable; the literal is kept only as a backward-compatible
# fallback.
# NOTE(review): a key published alongside the code should be rotated.
ACCESS_KEY = os.environ.get('SCRAPESTACK_ACCESS_KEY', '3765bb2ae3b776bb4dcd5d01b77a7585')
BASE_URL = 'http://api.scrapestack.com/'
ENDPOINT = 'scrape'

# Yelp search URL; the result offset is appended as the `start` value.
URL_TO_SCRAPE = "https://www.yelp.com/search?find_desc=Pizza&find_loc=Boston%2C+MA&start="

# Offsets to request: 0, 10, ..., 220.
PAGE_OFFSETS = range(0, 230, 10)

# Weight of the prior in the Bayesian average. It acts like this many
# pseudo-reviews at the overall mean rating (it is a count, not a percentile).
PRIOR_WEIGHT = 60

# Number of rows printed from the top of the ranking.
TOP_N = 20

# Seconds to wait for the proxy before giving up on a page.
REQUEST_TIMEOUT = 30

# Review counts are spans whose whole text is an integer.
REVIEW_COUNT_PATTERN = re.compile(r'^\d+$')

# A rating label starts with a number such as "4" or "4.5".
RATING_PATTERN = re.compile(r'^\s*(\d+(?:\.\d+)?)')


def fetch_page(page_num):
    """Return the HTML of one search-result page, fetched through the proxy.

    Args:
        page_num: Result offset appended to ``URL_TO_SCRAPE``.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the proxy answers with an error status.
    """
    target_url = URL_TO_SCRAPE + str(page_num)
    # Passing the target through `params` URL-encodes it, so the `?` and `&`
    # inside the Yelp URL are not mistaken for the proxy's own parameters.
    response = requests.get(
        f'{BASE_URL}{ENDPOINT}',
        params={'access_key': ACCESS_KEY, 'url': target_url},
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.text


def parse_rating(label):
    """Return the leading number of an aria-label as a float.

    Args:
        label: Text of an ``aria-label`` attribute, e.g. ``"4.5 star rating"``.

    Returns:
        The number as a float, or ``None`` when the label does not start
        with a number.
    """
    match = RATING_PATTERN.match(label)
    return float(match.group(1)) if match else None


def parse_page(html):
    """Extract names, review counts and average ratings from one page.

    Args:
        html: Page markup as a string.

    Returns:
        A ``(names, review_counts, ratings)`` tuple of lists.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Restaurant names live in the `name` attribute of the result links.
    names = [
        link['name'].replace('%20', ' ')
        for link in soup.find_all('a', class_='css-19v1rkv', attrs={'name': True})
    ]

    review_counts = [
        int(span.text)
        for span in soup.find_all('span', class_='css-chan6m', string=REVIEW_COUNT_PATTERN)
    ]

    ratings = []
    for badge in soup.find_all('div', attrs={"aria-label": True, "role": 'img'}):
        value = parse_rating(badge['aria-label'])
        # Skip role="img" elements whose label is not a rating.
        if value is not None:
            ratings.append(value)

    return names, review_counts, ratings


def bayesian_average(ratings, review_counts, prior_weight=PRIOR_WEIGHT):
    """Shrink each rating towards the overall mean according to its review count.

    Computes ``(rating * count + prior_weight * mean) / (count + prior_weight)``
    for every entry, where ``mean`` is the mean of ``ratings``.

    Args:
        ratings: Average rating per restaurant.
        review_counts: Number of reviews per restaurant, same length as
            ``ratings``.
        prior_weight: Number of pseudo-reviews given to the overall mean.

    Returns:
        A NumPy array of adjusted ratings (empty for empty input).

    Raises:
        ValueError: If the two inputs differ in length.
    """
    rating_arr = np.asarray(ratings, dtype=float)
    count_arr = np.asarray(review_counts, dtype=float)
    if rating_arr.shape != count_arr.shape:
        raise ValueError(
            f'ratings and review_counts differ in length: '
            f'{rating_arr.size} vs {count_arr.size}'
        )
    if rating_arr.size == 0:
        # Avoid taking the mean of an empty array.
        return rating_arr
    prior_mean = rating_arr.mean()
    return (rating_arr * count_arr + prior_weight * prior_mean) / (count_arr + prior_weight)


def build_ranking(names, review_counts, ratings, prior_weight=PRIOR_WEIGHT):
    """Build the results table sorted by Bayesian average, best first.

    Args:
        names: Restaurant names.
        review_counts: Number of reviews per restaurant.
        ratings: Average rating per restaurant.
        prior_weight: Forwarded to ``bayesian_average``.

    Returns:
        A DataFrame with the columns ``Restaurant Name``, ``Number of
        Reviews``, ``Rating`` and ``Bayesian``.

    Raises:
        ValueError: If the three inputs differ in length, which means the
            scraped lists no longer line up row by row.
    """
    if not len(names) == len(review_counts) == len(ratings):
        raise ValueError(
            f'scraped lists are misaligned: {len(names)} names, '
            f'{len(review_counts)} review counts, {len(ratings)} ratings'
        )
    df = pd.DataFrame({
        'Restaurant Name': names,
        'Number of Reviews': review_counts,
        'Rating': ratings,
        'Bayesian': bayesian_average(ratings, review_counts, prior_weight),
    })
    return df.sort_values(by='Bayesian', ascending=False)


def main():
    """Scrape every configured page and print the top of the ranking."""
    name_data, no_ratings, avg_rating = [], [], []
    for page_num in PAGE_OFFSETS:
        page_names, page_counts, page_ratings = parse_page(fetch_page(page_num))
        name_data.extend(page_names)
        no_ratings.extend(page_counts)
        avg_rating.extend(page_ratings)

    sorted_df = build_ranking(name_data, no_ratings, avg_rating)
    first_20 = sorted_df.head(TOP_N)
    print(first_20)


if __name__ == '__main__':
    main()

Conclusion: “Ciao! Pizza & Pasta” is top-rated

Previous
Previous

Multi-Component Flash Evaporation Calculator