Data source: https://www.kaggle.com/usdot/flight-delays#flights.csv
import pandas as pd
from datetime import datetime
from IPython.display import display
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns; sns.set()
import warnings
warnings.filterwarnings("ignore")
airlines_path = "airlines.csv"
airports_path = "airports.csv"
flights_path = "flights.csv"
airlines = pd.read_csv(airlines_path)
airports = pd.read_csv(airports_path)
flights = pd.read_csv(flights_path)
2.1. Airlines
display(airlines.head())
| IATA_CODE | AIRLINE | |
|---|---|---|
| 0 | UA | United Air Lines Inc. |
| 1 | AA | American Airlines Inc. |
| 2 | US | US Airways Inc. |
| 3 | F9 | Frontier Airlines Inc. |
| 4 | B6 | JetBlue Airways |
2.2. Airports
display(airports.head())
| IATA_CODE | AIRPORT | CITY | STATE | COUNTRY | LATITUDE | LONGITUDE | |
|---|---|---|---|---|---|---|---|
| 0 | ABE | Lehigh Valley International Airport | Allentown | PA | USA | 40.65236 | -75.44040 |
| 1 | ABI | Abilene Regional Airport | Abilene | TX | USA | 32.41132 | -99.68190 |
| 2 | ABQ | Albuquerque International Sunport | Albuquerque | NM | USA | 35.04022 | -106.60919 |
| 3 | ABR | Aberdeen Regional Airport | Aberdeen | SD | USA | 45.44906 | -98.42183 |
| 4 | ABY | Southwest Georgia Regional Airport | Albany | GA | USA | 31.53552 | -84.19447 |
2.3. Flights
display(flights.head())
| YEAR | MONTH | DAY | DAY_OF_WEEK | AIRLINE | FLIGHT_NUMBER | TAIL_NUMBER | ORIGIN_AIRPORT | DESTINATION_AIRPORT | SCHEDULED_DEPARTURE | DEPARTURE_TIME | DEPARTURE_DELAY | TAXI_OUT | WHEELS_OFF | SCHEDULED_TIME | ELAPSED_TIME | AIR_TIME | DISTANCE | WHEELS_ON | TAXI_IN | SCHEDULED_ARRIVAL | ARRIVAL_TIME | ARRIVAL_DELAY | DIVERTED | CANCELLED | CANCELLATION_REASON | AIR_SYSTEM_DELAY | SECURITY_DELAY | AIRLINE_DELAY | LATE_AIRCRAFT_DELAY | WEATHER_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015 | 1 | 1 | 4 | AS | 98 | N407AS | ANC | SEA | 5 | 2354.0 | -11.0 | 21.0 | 15.0 | 205.0 | 194.0 | 169.0 | 1448 | 404.0 | 4.0 | 430 | 408.0 | -22.0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2015 | 1 | 1 | 4 | AA | 2336 | N3KUAA | LAX | PBI | 10 | 2.0 | -8.0 | 12.0 | 14.0 | 280.0 | 279.0 | 263.0 | 2330 | 737.0 | 4.0 | 750 | 741.0 | -9.0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2015 | 1 | 1 | 4 | US | 840 | N171US | SFO | CLT | 20 | 18.0 | -2.0 | 16.0 | 34.0 | 286.0 | 293.0 | 266.0 | 2296 | 800.0 | 11.0 | 806 | 811.0 | 5.0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2015 | 1 | 1 | 4 | AA | 258 | N3HYAA | LAX | MIA | 20 | 15.0 | -5.0 | 15.0 | 30.0 | 285.0 | 281.0 | 258.0 | 2342 | 748.0 | 8.0 | 805 | 756.0 | -9.0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2015 | 1 | 1 | 4 | AS | 135 | N527AS | SEA | ANC | 25 | 24.0 | -1.0 | 11.0 | 35.0 | 235.0 | 215.0 | 199.0 | 1448 | 254.0 | 5.0 | 320 | 259.0 | -21.0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
1.1. Prepare data
# This cell calculate the number of Cancelled Flights and Delayed Flights each day
# Create a dataframe with cancellation and delay information each day
problems_per_day = flights.loc[:,['YEAR','MONTH','DAY','CANCELLED', 'DEPARTURE_DELAY']]
problems_per_day['Date'] = problems_per_day.apply(lambda x: datetime(int(x['YEAR']), int(x['MONTH']), int(x['DAY']))\
.strftime('%Y-%m-%d %a'), axis=1)
problems_per_day.drop(['YEAR','MONTH','DAY'], axis=1, inplace=True)
# Create a column specifying where a flight is delayed or not.
# When DEPARTURE_DELAY < 0, that flight is defined as a delayed flight
problems_per_day['DELAYED'] = problems_per_day.apply(lambda x: 1 if x['DEPARTURE_DELAY'] < 0 else 0, axis=1)
problems_per_day.drop(['DEPARTURE_DELAY'], axis=1, inplace=True)
# Calculate the number of Cancelled Flights and Delayed Flights each day
cancelled_delayed = problems_per_day.groupby(['Date'])['CANCELLED', 'DELAYED'].sum().reset_index()
display(cancelled_delayed.head())
| Date | CANCELLED | DELAYED | |
|---|---|---|---|
| 0 | 2015-01-01 Thu | 466 | 6801 |
| 1 | 2015-01-02 Fri | 257 | 7355 |
| 2 | 2015-01-03 Sat | 331 | 4888 |
| 3 | 2015-01-04 Sun | 433 | 4491 |
| 4 | 2015-01-05 Mon | 435 | 5885 |
1.2. Visualize data
# Use python package "plotly" for visualizing the interactive plots
fig = go.Figure()
# Line plot for the number of cancelled flights
fig.add_trace(
go.Scatter(name='Cancelled flights', x=cancelled_delayed['Date'], y=cancelled_delayed['CANCELLED'],
marker=dict(color='rgba(0,0,0,0.8)',)),
)
# Line plot for the number of delayed flights
fig.add_trace(
go.Scatter(name='Delayed flights', x=cancelled_delayed['Date'], y=cancelled_delayed['DELAYED'],
marker=dict(color='rgba(222,45,38,0.8)',))
)
# Set parameters for the plot
fig.update_xaxes(title_text="<b>Dates</b>", showticklabels=False)
fig.update_yaxes(title_text="<b>Number of flights</b>")
fig.update_layout(title={'text': "<b>Cancelled flights and Delayed flights per Day</b>",
'y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'},
autosize=False, width=1000,height=500,
)
fig.show()
COMMENT
1.1. Prepare data
# Calculate the number of Cancelled and Not-Cancelled Flights
airlinecode_cancel = pd.crosstab(flights['AIRLINE'], flights['CANCELLED'])\
.rename(columns={0: 'NotCancel', 1: 'Cancel'})
# Calculate the percentage of Cancelled Flights
airlinecode_cancel['PercentCancel'] = airlinecode_cancel.apply(lambda x: float(x['Cancel'])/(x['Cancel']+x['NotCancel']),axis=1)
airlinecode_cancel.reset_index(inplace=True)
airlinecode_cancel.rename(columns={'AIRLINE': 'IATA_CODE'}, inplace=True)
# Get name of airlines
airline_cancel = airlinecode_cancel.merge(airlines, left_on='IATA_CODE', right_on='IATA_CODE')
display(airline_cancel)
| IATA_CODE | NotCancel | Cancel | PercentCancel | AIRLINE | |
|---|---|---|---|---|---|
| 0 | AA | 715065 | 10919 | 0.015040 | American Airlines Inc. |
| 1 | AS | 171852 | 669 | 0.003878 | Alaska Airlines Inc. |
| 2 | B6 | 262772 | 4276 | 0.016012 | JetBlue Airways |
| 3 | DL | 872057 | 3824 | 0.004366 | Delta Air Lines Inc. |
| 4 | EV | 556746 | 15231 | 0.026629 | Atlantic Southeast Airlines |
| 5 | F9 | 90248 | 588 | 0.006473 | Frontier Airlines Inc. |
| 6 | HA | 76101 | 171 | 0.002242 | Hawaiian Airlines Inc. |
| 7 | MQ | 279607 | 15025 | 0.050996 | American Eagle Airlines Inc. |
| 8 | NK | 115375 | 2004 | 0.017073 | Spirit Air Lines |
| 9 | OO | 578393 | 9960 | 0.016929 | Skywest Airlines Inc. |
| 10 | UA | 509150 | 6573 | 0.012745 | United Air Lines Inc. |
| 11 | US | 194648 | 4067 | 0.020466 | US Airways Inc. |
| 12 | VX | 61369 | 534 | 0.008626 | Virgin America |
| 13 | WN | 1245812 | 16043 | 0.012714 | Southwest Airlines Co. |
1.2. Visualize data
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Bar plot for the number of flights which were not cancelled
fig.add_trace(
go.Bar(name='NotCancelled flights', x=airline_cancel['AIRLINE'], y=airline_cancel['NotCancel'],
marker=dict(color='rgb(49,130,189)',)),
secondary_y=False,
)
# Bar plot for the number of flights which were cancelled
fig.add_trace(
go.Bar(name='Cancelled flights', x=airline_cancel['AIRLINE'], y=airline_cancel['Cancel'],
marker=dict(color='rgba(222,45,38,0.8)',)),
secondary_y=False,
)
# Line plot for the percentage of cancelled flights
fig.add_trace(
go.Scatter(name='% of Cancelled flights', x=airline_cancel['AIRLINE'], y=airline_cancel['PercentCancel'],
marker=dict(color='rgba(0,0,0,0.8)',), mode='lines+markers'),
secondary_y=True,
)
fig.update_xaxes(title_text="<b>Airlines</b>")
fig.update_yaxes(title_text="<b>Number of flights</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Percentage of cancelled flights</b>", tickformat=".2%", showgrid=False, secondary_y=True)
fig.update_layout(barmode='stack', title={'text': "<b>Airlines: Number of flights and Ratio of Cancellation</b>",
'y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'})
fig.show()
COMMENT
3.1. Prepare data
cancellation_reason = flights['CANCELLATION_REASON'].fillna('Others')\
.value_counts()\
.reset_index()\
.rename(columns={'index': 'Reason', 'CANCELLATION_REASON': 'Count'})
cancellation_reason.replace('A', 'Airline/Carrier', inplace=True)
cancellation_reason.replace('B', 'Weather', inplace=True)
cancellation_reason.replace('C', 'National Air System', inplace=True)
cancellation_reason.replace('D', 'Security', inplace=True)
display(cancellation_reason)
| Reason | Count | |
|---|---|---|
| 0 | Others | 5729195 |
| 1 | Weather | 48851 |
| 2 | Airline/Carrier | 25262 |
| 3 | National Air System | 15749 |
| 4 | Security | 22 |
3.2. Visualize data
fig = px.pie(cancellation_reason, values='Count', names='Reason', \
title={'text': "<b>Cancellation Reasons</b>",\
'y':0.05,'x':0.5,'xanchor': 'center','yanchor': 'bottom'})
fig.show()
COMMENT
4.1. Prepare data
airportscode_delay = flights.loc[:,['ORIGIN_AIRPORT','DESTINATION_AIRPORT','DEPARTURE_DELAY','ARRIVAL_DELAY']]
airportscode_delay.rename(columns={'ORIGIN_AIRPORT': 'ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT': 'DESTINATION_AIRPORT_CODE'},\
inplace=True)
display(airportscode_delay.head())
| ORIGIN_AIRPORT_CODE | DESTINATION_AIRPORT_CODE | DEPARTURE_DELAY | ARRIVAL_DELAY | |
|---|---|---|---|---|
| 0 | ANC | SEA | -11.0 | -22.0 |
| 1 | LAX | PBI | -8.0 | -9.0 |
| 2 | SFO | CLT | -2.0 | 5.0 |
| 3 | LAX | MIA | -5.0 | -9.0 |
| 4 | SEA | ANC | -1.0 | -21.0 |
# Calculate the mean of departure delay
airports_depart_df = airportscode_delay.groupby(['ORIGIN_AIRPORT_CODE'])['DEPARTURE_DELAY'].mean().reset_index()
airports_depart_df.rename(columns={'DEPARTURE_DELAY': 'Mean_Departure_Delay'}, inplace=True)
# Get name of departure airports
airports_depart = airports_depart_df.merge(airports[['IATA_CODE','AIRPORT']], \
left_on='ORIGIN_AIRPORT_CODE', right_on='IATA_CODE')
airports_depart.drop(columns=['ORIGIN_AIRPORT_CODE', 'IATA_CODE'], axis=1, inplace=True)
# Select top 10 airports which have the longest delay duration on departure
airports_delay = airports_depart.loc[(airports_depart['Mean_Departure_Delay'] < 0)].sort_values(by=['Mean_Departure_Delay'])
airports_delay.loc[:,'Mean_Departure_Delay'] *= -1
top10_departure_delay_airports = airports_delay.head(10)
display(top10_departure_delay_airports)
| Mean_Departure_Delay | AIRPORT | |
|---|---|---|
| 320 | 6.292237 | Yakutat Airport |
| 70 | 6.058537 | Canyonlands Field |
| 98 | 3.771760 | Elko Regional Airport |
| 314 | 3.740000 | Valdez Airport |
| 56 | 3.261868 | Merle K. (Mudhole) Smith Airport |
| 189 | 2.976190 | Lewiston-Nez Perce County Airport |
| 242 | 1.328221 | Pocatello Regional Airport |
| 309 | 0.895911 | Magic Valley Regional AirportĀ (Joslin Field) |
| 162 | 0.863947 | Hilo International Airport |
| 43 | 0.065292 | Brainerd Lakes Regional Airport |
fig = px.bar(top10_departure_delay_airports, x="Mean_Departure_Delay", y="AIRPORT", orientation='h')
fig.update_xaxes(title_text="<b>Mean time delay (in minutes) </b>")
fig.update_yaxes(title_text="<b>Airports</b>", autorange="reversed")
fig.update_layout(barmode='stack', title={'text': "<b>Top 10 Departure Airports have the Longest Time Delay</b>",
'y':0.95,'x':0.5,'xanchor': 'center','yanchor': 'top'})
fig.show()