Best Pizza in Boston
Purpose: Kind of nerdy, but I’ve always loved Master Splinter and his obsession with pizza. I spent a weekend in Boston and asked around for the best pizza place, with mixed results. I ended up writing a script that scrapes each restaurant’s name, number of reviews, and average rating from Yelp, then computes a Bayesian average so that ratings backed by only a few reviews are pulled toward the overall mean and don’t dominate the ranking. Beautiful city.
import os
import re
from urllib.parse import unquote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Scrape-proxy (scrapestack) settings and the Yelp search URL to page through.
# NOTE(review): an API key committed to source should be treated as leaked and
# rotated. Set SCRAPESTACK_ACCESS_KEY in the environment to override the
# fallback below without editing this file.
access_key = os.environ.get('SCRAPESTACK_ACCESS_KEY', '3765bb2ae3b776bb4dcd5d01b77a7585')
base_url = 'http://api.scrapestack.com/'
endpoint = 'scrape'
# The page offset is appended to the trailing `start=` parameter.
url_to_scrape = "https://www.yelp.com/search?find_desc=Pizza&find_loc=Boston%2C+MA&start="
# Results table; its columns are filled in once scraping has finished.
df = pd.DataFrame(columns=['Restaurant Name', 'Number of Reviews', 'Rating', 'Bayesian'])
# Parallel accumulators: one entry per scraped restaurant in each list.
name_data, no_ratings, avg_rating = [], [], []
# Compile the patterns once instead of on every page.
# A span whose whole text is digits is taken to be a review count.
review_count_re = re.compile(r'^\d+$')
# Leading number of a rating label (text such as "4.5 ..." or "4 ...").
rating_re = re.compile(r'\d+(?:\.\d+)?')

# Loop through the result pages; `start` advances in steps of 10.
for page_num in range(0, 230, 10):
    # Send the request through the scrape proxy. Passing the target as a
    # query parameter lets requests percent-encode it, so the target's own
    # `&find_loc=...&start=...` is not mistaken for proxy parameters.
    r = requests.get(
        f'{base_url}{endpoint}',
        params={'access_key': access_key, 'url': url_to_scrape + str(page_num)},
        timeout=30,  # do not hang forever on a stalled connection
    )
    if not r.ok:
        # Parsing an error page would only contribute garbage; skip it.
        print(f'Skipping start={page_num}: HTTP {r.status_code}')
        continue

    # Create the soup from the page content
    soup = BeautifulSoup(r.text, 'html.parser')

    # Restaurant names: anchors carrying a `name` attribute. Decode every
    # percent-escape (not only %20) so names with '&', quotes, etc. read cleanly.
    name_links = soup.find_all('a', class_='css-19v1rkv', attrs={'name': True})
    name_data.extend(unquote(link['name']) for link in name_links)

    # Review counts: spans whose text is purely numeric.
    count_spans = soup.find_all('span', class_='css-chan6m', string=review_count_re)
    no_ratings.extend(int(span.text) for span in count_spans)

    # Average ratings: read from the aria-label of role="img" divs.
    rating_divs = soup.find_all('div', attrs={"aria-label": True, "role": 'img'})
    for div in rating_divs:
        found = rating_re.match(div['aria-label'])
        if found is None:
            # Not a rating label (does not start with a number); skip it
            # rather than raising ValueError from float().
            continue
        avg_rating.append(float(found.group()))
# Assemble the scraped columns into the results table. pandas raises
# ValueError here if the three lists ended up with different lengths.
df['Restaurant Name'] = name_data
df['Number of Reviews'] = no_ratings
df['Rating'] = avg_rating

# Bayesian (shrinkage) average: every restaurant is scored as if it had C
# additional reviews at the overall mean rating m, so a high rating backed by
# few reviews is pulled toward m while well-reviewed places barely move.
#     score = (rating * n_reviews + C * m) / (n_reviews + C)
m = np.mean(avg_rating)
C = 60  # prior weight, in pseudo-reviews (a fixed constant, not a percentile)

# Vectorized over all restaurants instead of an index loop.
ratings = np.asarray(avg_rating, dtype=float)
review_counts = np.asarray(no_ratings, dtype=float)
avg_Bayes = ((ratings * review_counts + C * m) / (review_counts + C)).tolist()
df['Bayesian'] = avg_Bayes

# Rank by the Bayesian score and show the top 20.
sorted_df = df.sort_values(by='Bayesian', ascending=False)
first_20 = sorted_df.head(20)
print(first_20)
Conclusion: “Ciao! Pizza & Pasta” is top-rated