In [1]:
from google.cloud.bigquery import Client, QueryJobConfig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
client = Client()
In [2]:
sql_query = "SELECT * FROM `bigquery-public-data.new_york_mv_collisions.nypd_mv_collisions` LIMIT 10"
bigquery_response = client.query(sql_query)
df = bigquery_response.to_dataframe()
In [3]:
df.head()
Out[3]:
borough | contributing_factor_vehicle_1 | contributing_factor_vehicle_2 | contributing_factor_vehicle_3 | contributing_factor_vehicle_4 | contributing_factor_vehicle_5 | cross_street_name | timestamp | latitude | longitude | ... | number_of_persons_killed | off_street_name | on_street_name | unique_key | vehicle_type_code1 | vehicle_type_code2 | vehicle_type_code_3 | vehicle_type_code_4 | vehicle_type_code_5 | zip_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | None | Fatigued/Drowsy | Fatigued/Drowsy | Unspecified | None | None | None | 2014-04-10 22:00:00 | NaN | NaN | ... | 0 | None | None | 3090695 | PASSENGER VEHICLE | PASSENGER VEHICLE | VAN | None | None | <NA> |
1 | None | Other Electronic Device | Other Electronic Device | None | None | None | JAY STREET | 2014-04-07 23:21:00 | NaN | NaN | ... | 0 | None | SANDS STREET | 317210 | SPORT UTILITY / STATION WAGON | SPORT UTILITY / STATION WAGON | None | None | None | <NA> |
2 | None | Unspecified | None | None | None | None | EAST 153 STREET | 2014-05-03 17:53:00 | NaN | NaN | ... | 0 | None | 3 AVENUE | 330832 | UNKNOWN | None | None | None | None | <NA> |
3 | None | Unspecified | None | None | None | None | None | 2014-03-18 16:10:00 | NaN | NaN | ... | 0 | PARKING LOT 82-07 153 AVENUE | None | 308098 | PASSENGER VEHICLE | PASSENGER VEHICLE | None | None | None | <NA> |
4 | None | Driver Inattention/Distraction | Unspecified | None | None | None | ELDERTS LANE | 2014-03-22 18:43:00 | NaN | NaN | ... | 0 | None | CONDUIT BOULEVARD | 309477 | PASSENGER VEHICLE | PASSENGER VEHICLE | None | None | None | <NA> |
5 rows × 28 columns
Boroughs by number of collisions¶
In [4]:
sql_query = " \
SELECT borough, COUNT(*) AS number_of_collisions \
FROM `bigquery-public-data.new_york_mv_collisions.nypd_mv_collisions` \
GROUP BY borough \
ORDER BY number_of_collisions DESC \
"
bigquery_response = client.query(sql_query)
collisions_by_borough = bigquery_response.to_dataframe()
In [5]:
collisions_by_borough.drop(index=0, inplace=True)
In [6]:
collisions_by_borough
Out[6]:
borough | number_of_collisions | |
---|---|---|
1 | BROOKLYN | 453525 |
2 | QUEENS | 382467 |
3 | MANHATTAN | 319677 |
4 | BRONX | 210812 |
5 | STATEN ISLAND | 59876 |
In [7]:
plt.figure(figsize=(5, 3))
plt.bar(x=collisions_by_borough["borough"],
height=collisions_by_borough["number_of_collisions"],
width=0.4, color="red", alpha=0.7, edgecolor="black")
plt.title("Collisions by Borough")
plt.xlabel("Borough")
plt.ylabel("Number of Collisions")
plt.grid(True)
plt.show()
In [8]:
### Which zipcodes are the most dangerous?
In [9]:
sql_query = " \
SELECT zip_code, COUNT(*) AS number_of_collisions \
FROM `bigquery-public-data.new_york_mv_collisions.nypd_mv_collisions` \
GROUP BY zip_code \
ORDER BY number_of_collisions DESC \
"
bigquery_response = client.query(sql_query)
collisions_by_zipcode = bigquery_response.to_dataframe()
In [10]:
collisions_by_zipcode.drop(index=0, inplace=True)
In [11]:
collisions_by_zipcode["zip_code"] = collisions_by_zipcode["zip_code"].astype(np.string_)
In [12]:
collisions_by_zipcode.head()
Out[12]:
zip_code | number_of_collisions | |
---|---|---|
1 | b'11207' | 27710 |
2 | b'11236' | 19212 |
3 | b'11101' | 19170 |
4 | b'11203' | 18316 |
5 | b'11234' | 17967 |
In [13]:
plt.figure(figsize=(10, 4))
plt.bar(x=collisions_by_zipcode[:30]["zip_code"],
height=collisions_by_zipcode[:30]["number_of_collisions"],
width=0.3, color="red", edgecolor="black")
plt.xticks(rotation=90)
plt.title("Collisions by Zip Code")
plt.xlabel("Zip Codes")
plt.ylabel("Number of Collisions")
plt.grid(True)
plt.show()
Let's scatter plot the longitudes and latitudes¶
In [14]:
sql_query = " \
SELECT longitude, latitude \
FROM `bigquery-public-data.new_york_mv_collisions.nypd_mv_collisions` \
WHERE (longitude IS NOT NULL) \
AND (latitude IS NOT NULL) \
"
bigquery_response = client.query(sql_query)
longitude_latitude = bigquery_response.to_dataframe()
First let's ensure the distributions of longitudes and latitudes values make sense¶
In [15]:
def clean_longitude_latitude(dataframe):
print("Shape before removing faulty longitudes:", dataframe.shape)
longitude_rows_to_drop = dataframe[dataframe["longitude"] < -74.27].index
dataframe.drop(labels=longitude_rows_to_drop, inplace=True)
longitude_rows_to_drop = dataframe[dataframe["longitude"] > -73.68].index
dataframe.drop(labels=longitude_rows_to_drop, inplace=True)
print("Shape after removing faulty longitudes:", dataframe.shape, "\n")
print("Shape before removing faulty latitudes:", dataframe.shape)
latitude_rows_to_drop = dataframe[dataframe["latitude"] < 40.49].index
dataframe.drop(labels=latitude_rows_to_drop, inplace=True)
latitude_rows_to_drop = dataframe[dataframe["latitude"] > 40.90].index
dataframe.drop(labels=latitude_rows_to_drop, inplace=True)
print("Shape after removing faulty latitudes:", dataframe.shape)
return dataframe
In [16]:
longitude_latitude = clean_longitude_latitude(longitude_latitude)
Shape before removing faulty longitudes: (1837243, 2) Shape after removing faulty longitudes: (1832753, 2) Shape before removing faulty latitudes: (1832753, 2) Shape after removing faulty latitudes: (1827802, 2)
In [17]:
plt.figure(figsize=(5, 3))
longitude_latitude["longitude"].hist(bins=100)
plt.title("Distribution of Longitudes")
plt.xlabel("Longitude Positions")
plt.ylabel("Number of Datapoints")
plt.show()
In [18]:
plt.figure(figsize=(5, 3))
longitude_latitude["latitude"].hist(bins=100)
plt.title("Distribution of Latitudes")
plt.xlabel("Latitudes Positions")
plt.ylabel("Number of Datapoints")
plt.show()
In [19]:
plt.scatter(x=longitude_latitude["longitude"], y=longitude_latitude["latitude"],
color="green", s=10, alpha=0.002)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Heatmap of Collisions")
plt.show()
Lethal collisions and the resulting deaths/injuries¶
In [20]:
sql_query = " \
SELECT number_of_persons_killed, COUNT(*) AS number_of_collisions \
FROM `bigquery-public-data.new_york_mv_collisions.nypd_mv_collisions` \
GROUP BY number_of_persons_killed \
ORDER BY number_of_collisions DESC \
"
bigquery_response = client.query(sql_query)
number_of_collisions_by_deaths = bigquery_response.to_dataframe()
In [21]:
number_of_collisions_by_deaths = number_of_collisions_by_deaths[number_of_collisions_by_deaths["number_of_persons_killed"] > 0]
number_of_collisions_by_deaths
Out[21]:
number_of_persons_killed | number_of_collisions | |
---|---|---|
1 | 1 | 2871 |
2 | 2 | 74 |
3 | 3 | 12 |
4 | 4 | 3 |
5 | 5 | 2 |
6 | 8 | 1 |
In [22]:
plt.figure(figsize=(6, 3))
plt.bar(x=number_of_collisions_by_deaths["number_of_persons_killed"],
height=number_of_collisions_by_deaths["number_of_collisions"],
width=0.3, color="orange", edgecolor="black", linewidth=2.5)
plt.title("Deaths in Lethal Collisions")
plt.xlabel("Deaths")
plt.ylabel("Number of Collisions")
plt.grid(True)
plt.show()
Scatter plot of where the lethal collisions most frequently occur¶
In [23]:
sql_query = " \
SELECT number_of_persons_killed, \
number_of_pedestrians_killed, \
number_of_cyclist_killed, \
number_of_motorist_killed, \
longitude, latitude \
FROM `bigquery-public-data.new_york_mv_collisions.nypd_mv_collisions` \
WHERE number_of_persons_killed > 0 \
AND (longitude IS NOT NULL) \
AND (latitude IS NOT NULL) \
"
bigquery_response = client.query(sql_query)
lethal_collisions = bigquery_response.to_dataframe()
In [24]:
lethal_collisions.head()
Out[24]:
number_of_persons_killed | number_of_pedestrians_killed | number_of_cyclist_killed | number_of_motorist_killed | longitude | latitude | |
---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 0.000000 | 0.000000 |
1 | 1 | 1 | 0 | 0 | -73.987960 | 40.576275 |
2 | 1 | 0 | 0 | 1 | -73.921570 | 40.583690 |
3 | 1 | 0 | 0 | 1 | -73.997574 | 40.594550 |
4 | 1 | 1 | 0 | 0 | -74.079817 | 40.606103 |
In [25]:
print(lethal_collisions.shape)
# drop null values
lethal_collisions.dropna(inplace=True)
# remove faulty longitude and latitude values
lethal_collisions = clean_longitude_latitude(lethal_collisions)
(2580, 6) Shape before removing faulty longitudes: (2580, 6) Shape after removing faulty longitudes: (2570, 6) Shape before removing faulty latitudes: (2570, 6) Shape after removing faulty latitudes: (2562, 6)
In [26]:
lethal_collisions["number_of_persons_killed"] = lethal_collisions["number_of_persons_killed"].astype(np.float16)
In [27]:
plt.scatter(x=lethal_collisions["longitude"],
y=lethal_collisions["latitude"],
s=lethal_collisions["number_of_persons_killed"] * 10,
color="red", edgecolor="black", alpha=0.6)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Heatmap of Lethal Collisions")
plt.grid(True)
plt.show()
Number of pedestrians killed in lethal collisions¶
In [28]:
group_by_pedestrian_deaths = lethal_collisions.groupby("number_of_pedestrians_killed") \
.count().reset_index()
group_by_pedestrian_deaths = group_by_pedestrian_deaths[["number_of_pedestrians_killed", "number_of_persons_killed"]]
group_by_pedestrian_deaths.rename(columns={ "number_of_persons_killed": "number_of_lethal_collisions" }, inplace=True)
group_by_pedestrian_deaths
Out[28]:
number_of_pedestrians_killed | number_of_lethal_collisions | |
---|---|---|
0 | 0 | 1227 |
1 | 1 | 1324 |
2 | 2 | 10 |
3 | 6 | 1 |
In [29]:
plt.figure(figsize=(5, 3))
plt.bar(x=group_by_pedestrian_deaths["number_of_pedestrians_killed"],
height=group_by_pedestrian_deaths["number_of_lethal_collisions"],
width=0.4, color="red", alpha=0.9, edgecolor="black", linewidth=2.5)
plt.title("Pedestrian Deaths in Lethal Collisions")
plt.xlabel("Number of Pedestrian Deaths")
plt.ylabel("Number of Lethal Collisions")
plt.grid(True)
plt.show()
Number of pedestrians killed in lethal collisions¶
In [30]:
group_by_cyclist_deaths = lethal_collisions.groupby("number_of_cyclist_killed") \
.count().reset_index()
group_by_cyclist_deaths = group_by_cyclist_deaths[["number_of_cyclist_killed", "number_of_persons_killed"]]
group_by_cyclist_deaths.rename(columns={ "number_of_persons_killed": "number_of_lethal_collisions" }, inplace=True)
group_by_cyclist_deaths
Out[30]:
number_of_cyclist_killed | number_of_lethal_collisions | |
---|---|---|
0 | 0 | 2355 |
1 | 1 | 206 |
2 | 2 | 1 |
In [31]:
plt.figure(figsize=(3, 3))
plt.bar(x=group_by_cyclist_deaths["number_of_cyclist_killed"],
height=group_by_cyclist_deaths["number_of_lethal_collisions"],
width=0.4, color="red", alpha=0.9, edgecolor="black", linewidth=2.5)
plt.title("Cyclist Deaths in Lethal Collisions")
plt.xlabel("Number of Cyclist Deaths")
plt.ylabel("Number of Lethal Collisions")
plt.grid(True)
plt.show()
Number of motorists killed in lethal collisions¶
In [32]:
group_by_motorist_deaths = lethal_collisions.groupby("number_of_motorist_killed") \
.count().reset_index()
group_by_motorist_deaths = group_by_motorist_deaths[["number_of_motorist_killed", "number_of_persons_killed"]]
group_by_motorist_deaths.rename(columns={ "number_of_persons_killed": "number_of_lethal_collisions" }, inplace=True)
group_by_motorist_deaths
Out[32]:
number_of_motorist_killed | number_of_lethal_collisions | |
---|---|---|
0 | 0 | 1585 |
1 | 1 | 913 |
2 | 2 | 49 |
3 | 3 | 12 |
4 | 4 | 2 |
5 | 5 | 1 |
In [33]:
plt.figure(figsize=(5, 3))
plt.bar(x=group_by_motorist_deaths["number_of_motorist_killed"],
height=group_by_motorist_deaths["number_of_lethal_collisions"],
width=0.4, color="red", alpha=0.9, edgecolor="black", linewidth=2.5)
plt.title("Motorist Deaths in Lethal Collisions")
plt.xlabel("Number of Motorist Deaths")
plt.ylabel("Number of Lethal Collisions")
plt.grid(True)
plt.show()
Graphing number of injuries caused by collisions¶
In [34]:
sql_query = " \
SELECT number_of_persons_injured, \
number_of_pedestrians_injured, \
number_of_cyclist_injured, \
number_of_motorist_injured, \
longitude, latitude \
FROM `bigquery-public-data.new_york_mv_collisions.nypd_mv_collisions` \
WHERE number_of_persons_injured > 0 \
AND (longitude IS NOT NULL) \
AND (latitude IS NOT NULL) \
"
bigquery_response = client.query(sql_query)
injury_collisions = bigquery_response.to_dataframe()
In [35]:
injury_collisions.head()
Out[35]:
number_of_persons_injured | number_of_pedestrians_injured | number_of_cyclist_injured | number_of_motorist_injured | longitude | latitude | |
---|---|---|---|---|---|---|
0 | 6 | 0 | 0 | 6 | -74.169840 | 40.570877 |
1 | 6 | 0 | 0 | 6 | -74.155600 | 40.582256 |
2 | 7 | 0 | 0 | 7 | -73.941542 | 40.585229 |
3 | 6 | 0 | 0 | 6 | -73.997574 | 40.594550 |
4 | 6 | 0 | 0 | 6 | -73.772835 | 40.595802 |
In [36]:
print(injury_collisions.shape)
# drop null values
injury_collisions.dropna(inplace=True)
# remove faulty longitude and latitude values
injury_collisions = clean_longitude_latitude(injury_collisions)
(423366, 6) Shape before removing faulty longitudes: (423366, 6) Shape after removing faulty longitudes: (421972, 6) Shape before removing faulty latitudes: (421972, 6) Shape after removing faulty latitudes: (420899, 6)
In [37]:
injury_collisions["number_of_persons_injured"] = injury_collisions["number_of_persons_injured"].astype(np.float16)
In [38]:
plt.scatter(x=injury_collisions["longitude"],
y=injury_collisions["latitude"],
s=injury_collisions["number_of_persons_injured"],
color="red", edgecolor="black", alpha=0.6)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Heatmap of Collisions Causing Injury")
plt.grid(True)
plt.show()
Pedestrians injured due to collision¶
In [39]:
group_by_pedestrian_injuries = injury_collisions.groupby("number_of_pedestrians_injured") \
.count().reset_index()
group_by_pedestrian_injuries = group_by_pedestrian_injuries[["number_of_pedestrians_injured", "number_of_persons_injured"]]
group_by_pedestrian_injuries.rename(columns={ "number_of_persons_injured": "number_of_collisions_causing_injury" }, inplace=True)
group_by_pedestrian_injuries
Out[39]:
number_of_pedestrians_injured | number_of_collisions_causing_injury | |
---|---|---|
0 | 0 | 317272 |
1 | 1 | 99848 |
2 | 2 | 3340 |
3 | 3 | 339 |
4 | 4 | 57 |
5 | 5 | 21 |
6 | 6 | 11 |
7 | 7 | 4 |
8 | 8 | 1 |
9 | 9 | 2 |
10 | 13 | 1 |
11 | 15 | 1 |
12 | 19 | 1 |
13 | 27 | 1 |
In [40]:
plt.figure(figsize=(6, 3))
plt.bar(x=group_by_pedestrian_injuries["number_of_pedestrians_injured"],
height=group_by_pedestrian_injuries["number_of_collisions_causing_injury"],
width=0.4, color="blue", alpha=0.7, edgecolor="black", linewidth=2.5)
plt.title("(Log Scale) Pedestrians Injured in Injury Causing Collisions")
plt.yscale("log")
plt.xlabel("Pedestrians Injured")
plt.ylabel("Number of Collisions (Log Scale)")
plt.grid(True)
plt.show()
Cyclists injured in injury causing collisions¶
In [41]:
group_by_cyclist_injuries = injury_collisions.groupby("number_of_cyclist_injured") \
.count().reset_index()
group_by_cyclist_injuries = group_by_cyclist_injuries[["number_of_cyclist_injured", "number_of_persons_injured"]]
group_by_cyclist_injuries.rename(columns={ "number_of_persons_injured": "number_of_collisions_causing_injury" }, inplace=True)
group_by_cyclist_injuries
Out[41]:
number_of_cyclist_injured | number_of_collisions_causing_injury | |
---|---|---|
0 | 0 | 370242 |
1 | 1 | 50124 |
2 | 2 | 514 |
3 | 3 | 18 |
4 | 4 | 1 |
In [42]:
plt.figure(figsize=(4, 3))
plt.bar(x=group_by_cyclist_injuries["number_of_cyclist_injured"],
height=group_by_cyclist_injuries["number_of_collisions_causing_injury"],
width=0.4, color="blue", alpha=0.7, edgecolor="black", linewidth=2.5)
plt.title("(Log Scale) Cyclists Injured in Injury Causing Collisions")
plt.yscale("log")
plt.xlabel("Cyclists Injured")
plt.ylabel("Number of Collisions (Log Scale)")
plt.grid(True)
plt.show()
In [43]:
group_by_motorist_injuries = injury_collisions.groupby("number_of_motorist_injured") \
.count().reset_index()
group_by_motorist_injuries = group_by_motorist_injuries[["number_of_motorist_injured", "number_of_persons_injured"]]
group_by_motorist_injuries.rename(columns={ "number_of_persons_injured": "number_of_collisions_causing_injury" }, inplace=True)
group_by_motorist_injuries.head()
Out[43]:
number_of_motorist_injured | number_of_collisions_causing_injury | |
---|---|---|
0 | 0 | 158432 |
1 | 1 | 176768 |
2 | 2 | 54813 |
3 | 3 | 18970 |
4 | 4 | 7103 |
In [44]:
plt.figure(figsize=(10, 4))
plt.bar(x=group_by_motorist_injuries["number_of_motorist_injured"],
height=group_by_motorist_injuries["number_of_collisions_causing_injury"],
width=0.4, color="blue", alpha=0.7, edgecolor="black", linewidth=2.5)
plt.title("(Log Scale) Motorists Injured in Injury Causing Collisions")
plt.yscale("log")
plt.xlabel("Motorists Injured")
plt.ylabel("Number of Collisions (Log Scale)")
plt.grid(True)
plt.show()
First group of factors contributing to collisions¶
In [45]:
sql_query = " \
SELECT contributing_factor_vehicle_1, COUNT(*) AS number_of_collisions \
FROM `bigquery-public-data.new_york_mv_collisions.nypd_mv_collisions` \
GROUP BY contributing_factor_vehicle_1 \
ORDER BY number_of_collisions DESC \
"
bigquery_response = client.query(sql_query)
first_factor_contributing_to_collisions = bigquery_response.to_dataframe()
In [46]:
first_factor_contributing_to_collisions.head(10)
Out[46]:
contributing_factor_vehicle_1 | number_of_collisions | |
---|---|---|
0 | Unspecified | 705582 |
1 | Driver Inattention/Distraction | 414046 |
2 | Failure to Yield Right-of-Way | 122943 |
3 | Following Too Closely | 110620 |
4 | Backing Unsafely | 76582 |
5 | Other Vehicular | 64252 |
6 | Passing or Lane Usage Improper | 57646 |
7 | Passing Too Closely | 51525 |
8 | Turning Improperly | 51127 |
9 | Fatigued/Drowsy | 47397 |
In [47]:
plt.figure(figsize=(17, 7))
plt.bar(x=first_factor_contributing_to_collisions[:20]["contributing_factor_vehicle_1"],
height=first_factor_contributing_to_collisions[:20]["number_of_collisions"],
width=0.5, color="purple", edgecolor="black", linewidth=2.5)
plt.xticks(rotation=90)
plt.yscale("log")
plt.title("(Log Scale) First Group of Factors Contributing to Collisions")
plt.xlabel("Collision Contributing Factors")
plt.ylabel("Number of Collisions (Log Scale)")
plt.grid(True)
plt.show()
In [ ]:
jupyter nbconvert --to html --no-input .ipynb