import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output,callback

#Load the Dataset
olympics = pd.read_csv("athlete_events.csv")
region = pd.read_csv("noc_regions.csv")

#Joins the olympics and region dataset to plot teams by region they represent
olympics = olympics.join(region.set_index("NOC"), on="NOC").rename(columns={"region": "Country"})

#Data Cleaning
#Removes redundant information from the name column
olympics["Name"] = olympics["Name"].str.split("(").str.get(0)

#Drops the notes column
olympics = olympics.drop(columns = ["notes"])

#Converts Height from cm to ft
olympics["Height"] = round(olympics["Height"] / 30.48, 2)

#Converts Weight from kg to lbs
olympics["Weight"] = round(olympics["Weight"] * 2.205, 3)

#Outputs summary statistics
olympics.describe()

#Calculates unique event count in each year, separated by season
events_per_year = olympics.groupby(["Year", "Season"])["Event"].nunique().reset_index()

#Graphs the results
fig = px.line(events_per_year, x="Year", y="Event", color="Season", template = "plotly_white", title = "Number of Olympic Events per Year")
fig.update_layout(yaxis_title = "# of Events")
fig.show()

#Calculates how many unique countries in each olympics, grouped by year and season (Summer/Winter)
countries_per_year = olympics.groupby(["Year", "Season"])["Country"].nunique().reset_index()

#Graphs the results
fig = px.line(countries_per_year, x = "Year", y = "Country", color = "Season", template = "plotly_white",
             title = "A Growing International Community <br /><sub># of Countries Competing in Each Olympic Games (1896 - 2016)</sub>")
fig.update_layout(yaxis_title = "# of Countries")
fig.show()

#Country codes of all countries before the boycotts (1972)
countries_72 = pd.Index(olympics[olympics["Year"] == 1972]["NOC"].unique())
#Country codes of all countries boycotting in 1976 
boycott_76 = countries_72.difference(pd.Index(olympics[(olympics["Year"] == 1976) & (olympics["Season"] == "Summer")]["NOC"].unique()))
#Country codes of all countries boycotting in 1980
boycott_80 = countries_72.difference(pd.Index(olympics[(olympics["Year"] == 1980) & (olympics["Season"] == "Summer")]["NOC"].unique()))

#All country codes (used to set index)
countries = pd.Series(olympics[(olympics["Year"] >= 1972) & (olympics["Year"] <= 1980)]["NOC"].unique())

#0 = no boycotts, 1 = '76 boycott, 2 = '80 boycott, 3 = both
#IsIn is used to determine boycotting countries. These values are multiplied to differentiate and added for the "both" category
boycotts = countries.isin(pd.Series(boycott_76)) + (countries.isin(pd.Series(boycott_80)) * 2)
#Replaces the continuous 0-3 range with discrete values matching what the numbers represent
boycotts = boycotts.replace([0, 1, 2, 3], ["Neither", "1976", "1980", "Both"])

#Corrects country codes for certain countries (Soviet Union and Greece)
boycotts.index = countries.replace(["URS", "GRE"], ["RUS", "GRC"])
#Converts to a dataframe and renames columns
boycotts = boycotts.reset_index().rename(columns = {"index": "Country", 0: "Boycott Year"})

#Plots the choropleth
fig = px.choropleth(boycotts, locations = "Country", color = "Boycott Year", title = "Summer Olympic Boycotts",
                    category_orders = {"Boycott Year": ["1976", "1980", "Both", "Neither"]})
fig.show()

olympics["Basketball Player"] = olympics["Sport"] == "Basketball"

fig = px.scatter(olympics[(olympics["Medal"] == "Gold")], 
                 x="Weight", y="Height", title="Gold Medal Winner Height vs Weight", color="Basketball Player",
                trendline = "ols", trendline_scope = "overall", template = "plotly_white")
fig.update_layout(xaxis_title = "Weight (lbs)", yaxis_title = "Height (ft)")
fig.show()

fig = px.box(olympics[(olympics["Medal"] == "Gold") & (olympics["Sex"] == "M") & (olympics["Season"] == "Summer")], 
             x = "Year", y = "Age", title = "Male Gold Medal Winner Age over Time", template = "plotly_white")

#There are gaps in the years due to WWI and WWII. This makes the x-axis categorical to remove the gaps when plotting
fig.update_xaxes(type='category', categoryorder='category ascending')
fig.show()

fig = px.box(olympics[(olympics["Medal"] == "Gold") & (olympics["Sex"] == "F") & (olympics["Season"] == "Summer")], 
             x = "Year", y = "Age", title = "Female Gold Medal Winner Age over Time", 
            color_discrete_sequence=px.colors.qualitative.Set1, template = "plotly_white")

fig.update_xaxes(type='category', categoryorder='category ascending')
fig.show()

sex_ratio = olympics[olympics["Season"] == "Summer"].groupby(["Year", "Sex"])["ID"].count() 
sex_ratio = sex_ratio / olympics[olympics["Season"] == "Summer"].groupby("Year")["ID"].count()
sex_ratio = sex_ratio.reset_index().rename(columns = {"ID": "Ratio"})

fig = px.bar(sex_ratio, x = "Year", y = "Ratio", color = "Sex", template = "plotly_white",
            title = "Olympics Gender Representation <br /><sub>Proportion of Athletes by Gender</sub>")
fig.update_xaxes(type='category', categoryorder='category ascending')
fig.update_layout(yaxis_title = "Percent of Athletes")

fig.show()

sex_count = olympics[olympics["Season"] == "Summer"].groupby(["Year", "Sex"])["ID"].count().reset_index()
sex_count = sex_count.rename(columns = {"ID": "# of Athletes"})

fig = px.line(sex_count, x = "Year", y = "# of Athletes", color = "Sex", template = "plotly_white", 
             title = "Olympics Gender Representation<br /><sub># of Athletes by Sex at Each Summer Olympic Games</sub>")
fig.show()

gold_medals_age = olympics[olympics["Medal"] == "Gold"].pivot_table(index = "Age", values = "ID", aggfunc = "count")
gold_medals_age = gold_medals_age.reset_index().rename(columns = {"ID": "Medals"})

fig = px.bar(gold_medals_age, x = "Age", y = "Medals", template = "plotly_white", title = "Gold Medals by Age")
fig.show()

#Uses pivot tables to get count of medals for each olympic athlete who has won at least one
#First, the dataset is filtered to include only medal winners ('Medal' col not NA)
#Next, a pivot table is created that counts the distinct medals for each athlete's events, using name as index, medal as column, and event counts as values
#Any null values (medals a given athlete has not won) are replaced with 0
#Lastly, the margins parameter is set to true to calculate each Olympian's total medal count
athlete_medals = olympics[olympics["Medal"].notna()].pivot_table(values="ID", index="Name", columns="Medal", aggfunc="count", fill_value=0, margins=True)

#Changes column order from alphabetical to place-order ascending
athlete_medals = athlete_medals[["Bronze", "Silver", "Gold", "All"]]

#This code removes the column totals, as I focus only on each athlete's total number of medals
athlete_medals = athlete_medals.iloc[:-1]

#Number of top athletes to graph. Can be adjusted
n = 20

#USes pandas nlargest to get the n largest medal counts
top_n = athlete_medals.nlargest(n, "All")

#Plots the medal counts as a stacked bar chart, with the correct colors for each medal type
fig = px.bar(top_n[["Bronze", "Silver", "Gold"]], 
             title = "Medal Counts of The Top " + str(n) + " Olympic Athletes",
             labels = {"Name": "Name", "value": "Count"}, template = "plotly_white",
             color_discrete_map={'Bronze': 'darkgoldenrod', 'Silver': 'silver', 'Gold': 'gold'})

#Displays the total number of medals each athlete has earned above their respective bar
fig.add_trace(go.Scatter(
    x=top_n.index, 
    y=top_n['All'],
    text=top_n['All'],
    mode='text',
    textposition='top center',
    textfont=dict(
        size=18,
    ),
    showlegend=False
))

#Adjusts the height of the graph so the text is not cut-off
fig.update_layout(yaxis_range=[0,40])

fig.show()

top_n_gold = athlete_medals.nlargest(n, "Gold")["Gold"].reset_index().rename(columns = {"Gold": "Medal Count"})

fig = px.bar(top_n_gold, y = "Medal Count", x = "Name", template = "plotly_white",
            title = "Top 10 Gold Medal Winners")
fig.show()

country_medals = olympics[olympics["Medal"].notna()].pivot_table(values="Event", index=["Country", "NOC", "Season", "Year", "Sport", "Medal"], aggfunc="nunique").reset_index()
country_medals = country_medals.rename(columns={"Event": "Medal Count"})

team_counts = country_medals["Country"].value_counts()
teams_to_replace = team_counts[team_counts < 20].index
country_medals["Country"] = country_medals["Country"].apply(lambda x: "Other" if x in teams_to_replace else x)

fig = px.treemap(country_medals, path=[px.Constant("all"), "Country", "Season", "Sport", "Medal"], values="Medal Count",
                title = "Medal Counts by Country",)
fig.show()

sport = "Ice Hockey"
medals_by_year = olympics[(olympics["Sport"] == sport) & (olympics["Medal"].notna())].pivot_table(values="Event", index="Year", columns="Country", aggfunc="nunique", fill_value=0).cumsum().melt(ignore_index=False).reset_index()

fig = px.line(medals_by_year, x="Year", y="value", color="Country", template = "plotly_white",
             title = "Rivals on Ice<br /><sup>" + sport + " Medal Counts by Country by Year</sup>")

fig.update_layout(yaxis_title="Medal Count")
fig.show()

#This function returns a line chart of cumulative medal counts by country in a given sport
def plot_sport(sport):
    medals_by_year = olympics[(olympics["Sport"] == sport) & (olympics["Medal"].notna())].pivot_table(values="Event", index="Year", columns="Country", aggfunc="nunique", fill_value=0).cumsum().melt(ignore_index=False).reset_index()
    fig = px.line(medals_by_year, x="Year", y="value", color="Country", template = "plotly_white",
                 title = sport + " Medal Counts by Country by Year")

    fig.update_layout(yaxis_title="Medal Count")
    
    return fig

#Initializes the application
app = Dash()
#Fetches all sports from the olympics dataset and sorts them alphabetically
sports = olympics["Sport"].unique()
sports.sort()

#Sets the layout of the dashboard (title, select dropdown, graph)
app.layout = [
    html.H1(children='Olympic Medal Tracker', style={'textAlign':'center', 'color':'white'}),
    dcc.Dropdown(sports, 'Athletics', id='sport-selection'),
    dcc.Graph(id='graph-content')
]

#Sets the app input variables (the selected value) and the output (figure)
@callback(
    Output('graph-content', 'figure'),
    Input('sport-selection', 'value')
)

#Returns a new figure when updating the graph
def update_graph(value):
    return plot_sport(value)

#Runs the Dash app
# if __name__ == '__main__':
#     app.run(debug=True)

import requests
from bs4 import BeautifulSoup

#Fetch the HTML from the given URL
url = "https://architectureofthegames.net/olympic-host-cities/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

#Find all list items on the page
host_list = soup.find("main").find_all("li")

#Tracks the current season list
season = " Summer"
#Initializes an empty dictionary to store Games: Host KV-pairs
host_dict = {}

#For each list item
for host in host_list:
    #Ignores cancelled games
    if "Cancelled" not in host.text:
        #Splits by space once to separate the year from the city/country/extra info
        row = host.text.strip().split(" ", 1)

        #Stores the year in a variable
        year = row[0].strip()
        #Separates the city and country
        city_country = [i.strip() for i in row[1].split(",")]

        #Stores the host city in a variable
        city = city_country[0]
        #Stores the host country in a variable
        country = city_country[1]

        #Removes 'postponed' from the 2020 Tokyo Olympics
        if "(" in country:
            country = country[0: country.index("(") - 1]

        #Switches the season to "Winter" at the start of the "Winter" list
        if year == "1924" and city == "Chamonix":
            season = " Winter"

        #Combines year and season to create an index matching the olympics["Games"] column
        games = year + season
        #Creates a new key-value pair, Games: Host Country
        host_dict[games] = country

#Converts the scraped data to a dataframe, with Games as index and host country as a column
host_df = pd.DataFrame.from_dict(host_dict, orient = "index", columns = ["Host"])

#Country names to overwrite and their replacements
#Ensures country names match the olympics["Country"] column
overwrite = ["England", "United States", "Yugoslavia", "West Germany", "Soviet Union"]
replace = ["UK", "USA", "Bosnia and Herzegovina", "Germany", "Russia"]

host_df["Host"] = host_df["Host"].replace(overwrite, replace)

#Joins the datAframes on the Games column
olympics = olympics.join(host_df, on="Games")

#Creates a new column, which is true if a given athlete's country is hosting the current Olympic Games
olympics["IsHost"] = olympics["Country"] == olympics["Host"]

#Displays the new columns
olympics[["Year", "Name", "Country", "Host", "IsHost"]].head()

#Counts medals for each year, separating by country and whether or not the country is hosting
#I check whether the year is divisible by 4 to filter out the 1906 Intercalated Games, which is not recognized by the IOC
host_medals = olympics[(olympics["Medal"].notna()) & (olympics["Season"] == "Summer") & (olympics["Year"] % 4 == 0)].groupby(["Year", "Country", "IsHost"])["Medal"].count().reset_index()

#Plots the data in a scatterplot
fig = px.scatter(host_medals.rename(columns={"IsHost": "Host Country"}), x = "Year", y = "Medal", color = "Host Country",
                title = "Does Hosting The Olympics Improve Performance? <br /><sub>Medal Counts by Country</sub>", 
                template = "plotly_white",  category_orders={"Host Country": [True, False]},
                color_discrete_sequence=px.colors.qualitative.Set1)

fig.update_xaxes(type='category', categoryorder='category ascending')
fig.update_layout(yaxis_title = "Medal Count")
fig.show()

us_gold = olympics[(olympics["Country"] == "USA") & (olympics["Medal"] == "Gold") & (olympics["Season"] == "Summer")].groupby(["Year", "IsHost"])
us_gold = us_gold["Event"].nunique().reset_index().rename(columns = {"IsHost": "Host Country"})

fig = px.bar(us_gold, x = "Year", y = "Event", color = "Host Country", category_orders={"Host Country": [True, False]},
             title = "US Gold Medal Counts",template = "plotly_white",
             color_discrete_sequence=px.colors.qualitative.Set1)

fig.update_xaxes(type='category', categoryorder='category ascending')
fig.update_layout(yaxis_title = "# of Gold Medals")

	ID	Age	Height	Weight	Year
count	271116.000000	261642.000000	210945.000000	208241.000000	271116.000000
mean	68248.954396	25.556898	5.752586	155.898777	1978.378480
std	39022.286345	6.393561	0.345256	31.637384	29.877632
min	1.000000	10.000000	4.170000	55.125000	1896.000000
25%	34643.000000	21.000000	5.510000	132.300000	1960.000000
50%	68205.000000	24.000000	5.740000	154.350000	1988.000000
75%	102097.250000	28.000000	6.000000	174.195000	2002.000000
max	135571.000000	97.000000	7.410000	471.870000	2016.000000

	Year	Name	Country	Host	IsHost
0	1992	A Dijiang	China	Spain	False
1	2012	A Lamusi	China	UK	False
2	1920	Gunnar Nielsen Aaby	Denmark	Belgium	False
3	1900	Edgar Lindenau Aabye	Denmark	France	False
4	1988	Christine Jacoba Aaftink	Netherlands	Canada	False

120 Years of Olympic History

Overview

Data

Content

Module Imports

Data Loading/Cleaning

Growth of the Games

Athlete Demographics

Top Olympians Medal Counts

Medal Counts by Country

Event Medal Counts

Does Hosting Improve Performance?