import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.interpolate import CubicSpline
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#Load the dataframe from the provided CSV file
df = pd.read_csv('Nat_Gas.csv')
#Convert the Dates column to datetime datatypes
df['Dates'] = pd.to_datetime(df['Dates'], format = '%m/%d/%y')

#Plot the change in natural gas price over time
fig = px.line(df, x = 'Dates', y = 'Prices', template = 'plotly_white')
fig.update_layout(xaxis_title = 'Date', yaxis_title = 'Price', title = 'Natural Gas Market Price Over Time')
fig.update_traces(line_color='#669ce8')
fig.show()

fig = make_subplots(rows = 1, cols = 1)

#Plot monthly data
fig.add_trace(go.Scatter(x = df['Dates'], y = df['Prices'], mode = 'lines', name = 'Prices', line = {'color': '#669ce8'}))

#Calculate and plot 6-month rolling averages (seasonal)
df['SeasonTrend'] = df['Prices'].rolling(6, center = True).mean()
fig.add_trace(go.Scatter(x = df['Dates'], y = df['SeasonTrend'], mode = 'lines', name = 'Seasonal Trend', line = {'color': 'orange'}))

#Calculate and plot 12-month rolling averages (yearly trend)
df['YearTrend'] = df['Prices'].rolling(12, center = True).mean()
fig.add_trace(go.Scatter(x = df['Dates'], y = df['YearTrend'], mode = 'lines', name = 'Yearly Trend', line = {'color': 'green'}))

fig.update_layout(template = 'plotly_white', title = 'Natural Gas Prices and Trends')
fig.show()

#Create a Pandas series of all dates between the start and end of the dataset, inclusive
dates = pd.date_range(start = df['Dates'].min(), end = df['Dates'].max(), freq = 'D')

#Fetches the index of each purchase-price date in the Pandas date_range
date_indices = [i for i, date in enumerate(dates) if date in set(df['Dates'])]

#Fit a series of cubic polynomials between each pair of purchase prices
spl = CubicSpline(date_indices, df['Prices'])

#Creates a new dataframe with all dates and estimated purchase price values
df_inter = pd.DataFrame({'Dates': dates, 'Prices': spl(np.arange(0, len(dates), 1))})

#Plots the daily price estimates as a line plot and actual values as a scatterplot
fig = make_subplots(rows = 1, cols = 1)
fig.add_trace(go.Scatter(x = df_inter['Dates'], y = df_inter['Prices'], mode = 'lines', name = 'Interpolated Price Values', line = {'color': '#669ce8'}))
fig.add_trace(go.Scatter(x = df['Dates'], y = df['Prices'], mode = 'markers', name = 'Actual Price Values', line = {'color': 'orange'}))
fig.update_layout(template = 'plotly_white', title = 'Interpolated vs Actual Price Values')
fig.show()

# Plot ACF and PACF for the differenced series
fig, axes = plt.subplots(1, 2, figsize=(16, 4))

# ACF plot
plot_acf(df['Prices'], lags=24, ax=axes[0])
axes[0].set_title('Autocorrelation Function (ACF)')

# PACF plot
plot_pacf(df['Prices'], lags=24, ax=axes[1])
axes[1].set_title('Partial Autocorrelation Function (PACF)')
plt.tight_layout()
plt.show()

# Perform the Augmented Dickey-Fuller test on the original series
result_original = adfuller(df["Prices"])
print("ADF Statistic (Original):", round(result_original[0], 4))
print("p-value (Original):", round(result_original[1], 4))

# Perform the Augmented Dickey-Fuller test on the differenced series
result_diff = adfuller(df["Prices"].diff().dropna())

print("\nADF Statistic (Differenced):", round(result_diff[0], 4))
print("p-value (Differenced):", round(result_diff[1], 4))

ADF Statistic (Original): 0.2181
p-value (Original): 0.9733

ADF Statistic (Differenced): -6.8448
p-value (Differenced): 0.0

np.random.seed(10)

# Split data into train and test
train_size = int(len(df) * 0.75)
train, test = df.iloc[:train_size], df.iloc[train_size:]

# Fit ARIMA model (order = p, d, q) using the training data
model = ARIMA(train["Prices"], order=(14,1,1))
model_fit = model.fit(method_kwargs={'maxiter':300})

# Forecast
forecast = model_fit.forecast(steps=len(test) * 2)

# Plot the results
fig = make_subplots(rows = 1, cols = 1)
fig.add_trace(go.Scatter(x = train.index, y = train["Prices"], mode = 'lines', name = 'Train', line = {'color': '#669ce8'}))
fig.add_trace(go.Scatter(x = test.index, y = test["Prices"], mode = 'lines', name = 'Test', line = {'color': 'orange'}))
fig.add_trace(go.Scatter(x = forecast.index, y = forecast, mode = 'lines', name = 'Forecast', line = {'color': 'green'}))
fig.update_layout(xaxis_title = 'Date Index', yaxis_title = 'Purchase Price', title = 'Natural Gas Price Forecast', template = 'plotly_white')
fig.show()

#Calculate RMSE
rmse = np.sqrt(mean_squared_error(test["Prices"], forecast[:len(test)]))
print(f"RMSE: {rmse:.4f}")

RMSE: 0.2656

#Initializes a list of 12 dates, each the end of the month starting with October 2024
eom_dates = list(pd.date_range(start = df['Dates'].max(), periods = 13, freq = 'ME'))[1:]

#Creates a new dataframe with predicted price values and corresponding end-of-date dates
df_pred = pd.DataFrame({'Dates': eom_dates, 'Prices': forecast[-12:]})

#Concatenates the existing data with the predicted data
df_all = pd.concat([df[['Dates', 'Prices']], df_pred])

#Repeats the interpolation process described above, but with the predicted data included
#Create a Pandas series of all dates between the start and end of the new dataset, inclusive
dates = pd.date_range(start = df_all['Dates'].min(), end = df_all['Dates'].max(), freq = 'D')

#Fetches the index of each purchase-price date in the Pandas date_range
date_indices = [i for i, date in enumerate(dates) if date in set(df_all['Dates'])]

#Fit a series of cubic polynomials between each pair of purchase prices
spl = CubicSpline(date_indices, df_all['Prices'])

#Creates a new dataframe with all dates and estimated purchase price values
df_all_inter = pd.DataFrame({'Dates': dates, 'Prices': spl(np.arange(0, len(dates), 1))})

#Plots the daily price estimates as a line plot and actual values as a scatterplot
fig = make_subplots(rows = 1, cols = 1)
fig.add_trace(go.Scatter(x = df_all_inter['Dates'], y = df_all_inter['Prices'], mode = 'lines', name = 'Interpolated Price Value', line = {'color': '#669ce8'}))
fig.add_trace(go.Scatter(x = df['Dates'], y = df['Prices'], mode = 'markers', name = 'Actual Price Value', line = {'color': 'orange'}))
fig.add_trace(go.Scatter(x = df_pred['Dates'], y = df_pred['Prices'], mode = 'markers', name = 'Predicted Price Value', line = {'color': 'green'}))
fig.update_layout(template = 'plotly_white', title = 'Interpolated, Actual, and Predicted Prices Over Time')
fig.show()

def get_price_estimate(date):
    try:
        estimate = df_all_inter[df_all_inter['Dates'] == date]['Prices'].values[0]
    except:
        estimate = -1
    return round(estimate, 2)

#Function to create objective function with constraints, run optimization, and optionally plot the results
def optimize_contract(inj_dates, wd_dates, inj_wd_rate, max_vol, storage_fee, plot_res):
    #Gets price estimates for natural gas on each injection/withdrawal date
    inj_cost = [get_price_estimate(date) for date in inj_dates]
    wd_revenue = [get_price_estimate(date) for date in wd_dates]

    #Defines optimization objective function (maximize profit)
    def obj(x):
        #Splits x array into injection and withdrawal amounts
        inj_amounts = x[:len(inj_cost)]
        wd_amounts = x[len(inj_cost):]

        #Costs = cost of purchasing quantities of natural gas + gas injection/withdrawal rate
        costs = np.dot(inj_cost, inj_amounts) + ((sum(inj_amounts) + sum(wd_amounts)) * inj_wd_rate)
        #Revenue = value of selling natural gas quantities
        revenue = np.dot(wd_revenue, wd_amounts)

        #Calculates cost of monthly storage fees
        #Creates a chronological timeline of all injection/withdrawal dates
        timeline = list(zip(inj_dates, inj_amounts)) + list(zip(wd_dates, -wd_amounts))
        timeline.sort(key = lambda i: i[0])

        #Tracks the current capacity
        current_capacity = timeline[0][1]

        #Iterates through each date after the first
        for i in range(1, len(timeline)):
            #If the capacity is nonzero, calculates number of months since the last date and multiplies by storage_fee
            if current_capacity > 1e-4:
                #Gets previous month
                prev_month = pd.to_datetime(timeline[i-1][0]).to_period('M')
                #Gets current month
                current_month = pd.to_datetime(timeline[i][0]).to_period('M')
                #Subtracts current month from previous to get total number of months
                n_months = (current_month - prev_month).n
                #Adds product of storage_fee and num_months to costs
                costs += (storage_fee * n_months)

            #Adds new amount to storage if injection, or subtracts if withdrawal
            current_capacity += timeline[i][1]

        #Returns profit (revenue - costs)
        #This value is negative to ensure the minimize function will maximize this value
        return -(revenue - costs)

    #Defines constraint that the value stored must be less than max_vol at all times
    def storage_constraint(x):
        #Splits x array into injection and withdrawal amounts
        inj_amounts = x[:len(inj_cost)]
        wd_amounts = x[len(inj_cost):]

        #Creates a chronological timeline of all injection/withdrawal dates
        timeline = list(zip(inj_dates, inj_amounts)) + list(zip(wd_dates, -wd_amounts))
        timeline.sort(key = lambda i: i[0])

        #Tracks the current capacity
        current_capacity = 0
        #Tracks available capacity and current capacity after each injection/withdrawal
        constraint_values = []
        
        for date, amount in timeline:
            #Adds if injection, subtracts if withdrawal
            current_capacity += amount
            #Current storage must be less than max_volume
            constraint_values.append(max_vol - current_capacity)
            #Current storage must be greater than 0
            constraint_values.append(current_capacity)

        #Returns all available capacity and current capacity values
        return np.array(constraint_values)

    #Initial guess for injection/withdrawal amounts
    x0 = np.zeros(len(inj_dates) + len(wd_dates))

    #Sets bounds between 0 and max_volume for each value
    b = (0, max_vol)
    bounds = [b] * (len(inj_dates) + len(wd_dates))

    #Adds the storage_constraint as an inequality (all values must be >= 0)
    storage_con = {'type': 'ineq', 'fun': storage_constraint}

    #Calculates optimal injection/withdrawal amounts with scipy minimize
    res = minimize(obj, x0, method='SLSQP', bounds = bounds, constraints = [storage_con])

    if(plot_res):
        #Extract injection and withdrawal amounts from optimization result ===
        inj_amounts = res.x[:len(inj_cost)]
        wd_amounts = res.x[len(inj_cost):]
        
        #Create timeline of all transactions
        timeline = list(zip(inj_dates, inj_amounts)) + list(zip(wd_dates, -wd_amounts))
        timeline.sort(key=lambda i: i[0])
        
        #Track storage capacity over time
        current_capacity = 0
        capacity_values = []
        dates = []
        
        for date, amount in timeline:
            current_capacity += amount
            capacity_values.append(round(current_capacity, 2))
            dates.append(pd.to_datetime(date))
        
        #Get price estimates
        inj_cost = [get_price_estimate(date) for date in inj_dates]
        wd_revenue = [get_price_estimate(date) for date in wd_dates]
        
        #Track profit over time
        profit = 0
        profit_dates = []
        profit_vals = []
        
        for i in range(len(timeline)):
            date = pd.to_datetime(timeline[i][0])
            #Gets the amount purchased/sold from the storage 
            amount = timeline[i][1]
            #Estimates the price of the quantity injected/withdrawn
            price = get_price_estimate(timeline[i][0])

            #Calculates and stores the profit on the current timeline date
            profit += (-1 * amount * price) - (inj_wd_rate * abs(amount))
            profit_dates.append(date)
            profit_vals.append(round(profit, 2))

            #If the quantity of natural gas stored is greater than 0, subtracts monthly storage fees from current profit
            if i < len(timeline) - 1 and amount > .01:
                next_date = pd.to_datetime(timeline[i + 1][0])
                months = pd.date_range(start=date, end=next_date, freq='ME')
                for m in months:
                    profit_dates.append(m)
                    profit -= storage_fee
                    profit_vals.append(profit)
        
        #Plot all 3 graphs
        fig = make_subplots(
            rows = 3, cols = 1, shared_xaxes = False,
            subplot_titles = ('Natural Gas Prices at Injection/Withdrawal', 'Storage Capacity Over Time', 'Profit Over Time')
        )
        
        #Subplot 1: Natural Gas Prices and Trade Dates
        fig.add_trace(go.Scatter(
            x = df_all_inter['Dates'], y = df_all_inter['Prices'], mode = 'lines', name = 'Prices', line = {'color': '#669ce8'}
        ), row = 1, col = 1)
        fig.add_trace(go.Scatter(x = inj_dates, y = inj_cost, mode = 'markers', name = 'Injection Date', marker = {'color': 'orange'}), row = 1, col = 1)
        fig.add_trace(go.Scatter(x = wd_dates, y = wd_revenue, mode = 'markers', name = 'Withdrawal Date', marker = {'color': 'green'}), row = 1, col = 1)
        
        #Subplot 2: Storage Capacity
        fig.add_trace(go.Scatter(x = dates, y = capacity_values, line = {'color': '#669ce8'}, showlegend=False), row = 2, col = 1)
        
        #Subplot 3: Profit Over Time
        fig.add_trace(go.Scatter(x = profit_dates, y = profit_vals, line = {'color': '#669ce8'}, showlegend=False), row = 3, col = 1)

        fig.update_layout(template = 'plotly_white', height = 600)
        fig.show()
    
    return res

inj_dates = ['2020-11-01', '2021-06-08', '2023-07-04']
wd_dates = ['2021-02-08', '2022-01-14', '2025-02-10']

res = optimize_contract(inj_dates, wd_dates, 0.01, 5000000, 10000, True)
print('The optimal value of the contract is $', abs(round(res.fun, 2)), sep = '')

The optimal value of the contract is $18372893.69

inj_dates = ['2021-02-08', '2022-01-14', '2025-02-10']
wd_dates = ['2020-11-01', '2021-06-08', '2023-07-04']

res = optimize_contract(inj_dates, wd_dates, 0.01, 5000000, 10000, True)
print('The optimal value of the contract is $', abs(round(res.fun, 2)), sep = '')

The optimal value of the contract is $0.0

Natural Gas Market Analytics

Price Interpolation and Forecasting

Purchase Price Interpolation

ARIMA Purchase Price Prediction

Putting It Together

Profit Optimization