nba_web_scraping3.py
## Web-scraping functions for a variety of sites that provide NBA statistics.
import requests # pip install requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import os
import statistics as stat
def scrape_boxscore(date, home_id, away_id):
"""Scrapes the basketball_reference website for daily boxscores of NBA data. Parameters are:
date in MM-DD-YYYY format (01/05/2021 would be 01052021,
basket-reference ID of the home team,
basket-reference ID of the away team."""
    # Sets up url to scrape and gets data for home and away boxscores
    date_triple = date.split('-')
    month = date_triple[0]
    day = date_triple[1]
    year = date_triple[2]
    url = 'https://www.basketball-reference.com/boxscores/' + year + month + day + '0' + home_id + '.html'
    df_away = pd.read_html(url)[0]
    df_home = pd.read_html(url)[8]
    df_list = [df_away, df_home]
    ids = [away_id, home_id]
    for home_away in range(2):
        df = df_list[home_away]
        # Deletes unwanted rows and joins 2 sub-dataframes into a single dataframe
        rows = list(df.index.values)
        last_row = rows[len(rows) - 1]
        df = df.drop([5, last_row])
        df = pd.concat([df['Unnamed: 0_level_0'], df['Basic Box Score Stats']], axis=1, sort=False)
        # Creates and adds a "Starter" variable (1 if starter, 0 if reserve)
        starter = [0] * df.shape[0]
        for i in range(5):
            starter[i] = 1
        df.insert(1, 'Starter', starter)
# Creates and adds a "Team" (Team ID) and "Home" (Home team or Away team)
home = [home_away] * df.shape[0]
team = [ids[home_away]] * df.shape[0]
df.insert(1, 'Team', team)
df.insert(2, 'Home', home)
# Changes "MP" time format from string '00:00' to float 0.0. Removes row if player did not play.
rows = list(df.index.values)
        for i in rows:
            if df.loc[i, 'MP'] == 'Did Not Play':
                df = df.drop([i])
            else:
                col_index = df.loc[i, 'MP'].find(':')
                df.loc[i, 'MP'] = int(df.loc[i, 'MP'][:col_index]) + int(df.loc[i, 'MP'][(col_index+1):])/60
        # Changes values in the "+/-" column from strings to integers
        rows = list(df.index.values)
        for i in rows:
            if df.loc[i, '+/-'] == '0':
                df.loc[i, '+/-'] = 0
            elif df.loc[i, '+/-'][0] == '+':
                df.loc[i, '+/-'] = int(df.loc[i, '+/-'][1:])
            else:
                df.loc[i, '+/-'] = -1 * int(df.loc[i, '+/-'][1:])
        # Drops any column that relates to efficiency stats
        df.drop(['FG%', '3P%', 'FT%'], inplace=True, axis=1)
        print(df)
        # Writes the boxscore to <boxscore_dir>/<date>/AWAYID_HOMEID.csv (away rows first, home rows appended)
        boxscore_dir = r'/Users/jacobplata/Desktop/pp'
        dates = os.listdir(boxscore_dir)
        if date not in dates:
            os.mkdir(boxscore_dir + '/' + date)
        games = os.listdir(boxscore_dir + '/' + date)
        game_file = away_id + '_' + home_id + '.csv'
        if game_file not in games or home_away == 1:
            with open(boxscore_dir + '/' + date + '/' + away_id + '_' + home_id + '.csv', 'a') as f:
                if home_away == 0:
                    df.to_csv(f, header=True, index=False)
                else:
                    df.to_csv(f, header=False, index=False)
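
# Example call (illustrative only: the date and matchup below are hypothetical, and the
# boxscore_dir folder used above must already exist on disk). Team IDs are the three-letter
# abbreviations basketball-reference uses in its boxscore URLs, e.g. 'BOS', 'MIA':
# scrape_boxscore('01-05-2021', 'BOS', 'MIA')
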
def scrape_all_boxscores(season):
    """Scrapes all the boxscore data from a single season in the NBA and stores each boxscore
    in a folder named by date (MM-DD-YYYY) as a csv file named AWAYID_HOMEID.csv."""
    pass
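
# A minimal sketch of how scrape_all_boxscores() could be filled in. It is an assumption-heavy
# outline, not a finished implementation:
#   * it assumes the monthly schedule pages at
#     https://www.basketball-reference.com/leagues/NBA_<season>_games-<month>.html,
#     with the schedule as the first table returned by pd.read_html() and columns
#     named 'Date', 'Visitor/Neutral' and 'Home/Neutral';
#   * TEAM_IDS is a hypothetical helper mapping full team names to the three-letter
#     IDs expected by scrape_boxscore() and would need all 30 teams filled in.
from datetime import datetime

TEAM_IDS = {'Boston Celtics': 'BOS', 'Miami Heat': 'MIA'}  # placeholder mapping, extend to all teams

def scrape_all_boxscores_sketch(season):
    """Sketch: walk a season's schedule month by month and call scrape_boxscore() per game."""
    months = ['october', 'november', 'december', 'january', 'february',
              'march', 'april', 'may', 'june', 'july', 'august', 'september']
    for month in months:
        url = ('https://www.basketball-reference.com/leagues/NBA_'
               + str(season) + '_games-' + month + '.html')
        try:
            schedule = pd.read_html(url)[0]
        except Exception:
            continue  # no schedule page for this month in this season
        for _, game in schedule.iterrows():
            try:
                game_date = datetime.strptime(str(game['Date']), '%a, %b %d, %Y')
            except ValueError:
                continue  # skip repeated header rows and 'Playoffs' separator rows
            home_id = TEAM_IDS.get(game['Home/Neutral'])
            away_id = TEAM_IDS.get(game['Visitor/Neutral'])
            if home_id and away_id:
                scrape_boxscore(game_date.strftime('%m-%d-%Y'), home_id, away_id)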