import warnings

# disable future,user,depreciation warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import pandas as pd
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

df = pd.read_csv('deep_sea_corals.csv')

df.info()

/tmp/ipykernel_57624/462335268.py:1: DtypeWarning: Columns (5,7,8,13) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv('deep_sea_corals.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 513373 entries, 0 to 513372
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CatalogNumber            513372 non-null  float64
 1   DataProvider             513372 non-null  object 
 2   ScientificName           513372 non-null  object 
 3   VernacularNameCategory   513197 non-null  object 
 4   TaxonRank                513364 non-null  object 
 5   Station                  253590 non-null  object 
 6   ObservationDate          513367 non-null  object 
 7   latitude                 513373 non-null  object 
 8   longitude                513373 non-null  object 
 9   DepthInMeters            513372 non-null  float64
 10  DepthMethod              496845 non-null  object 
 11  Locality                 389645 non-null  object 
 12  LocationAccuracy         484662 non-null  object 
 13  SurveyID                 306228 non-null  object 
 14  Repository               496584 non-null  object 
 15  IdentificationQualifier  488591 non-null  object 
 16  EventID                  472141 non-null  object 
 17  SamplingEquipment        485883 non-null  object 
 18  RecordType               501077 non-null  object 
 19  SampleID                 402294 non-null  object 
dtypes: float64(2), object(18)
memory usage: 78.3+ MB

print('unique taxon identifiers: ', df.TaxonRank.unique())
print('sampling equipment utilized: ', df.SamplingEquipment.value_counts())

unique taxon identifiers:  [nan 'species' 'genus' 'phylum' 'order' 'family' 'suborder' 'subgenus'
 'subspecies' 'variety' 'class' 'forma' 'subfamily' 'subclass']
sampling equipment utilized:  SamplingEquipment
ROV                    326289
submersible             70268
trawl                   51899
towed camera            19626
longline                 9481
dredge                   2840
AUV                      2535
drop camera              1262
grab                      621
net                       504
corer                     212
SCUBA                     174
multiple gears             86
trap                       41
other                      20
hook and line              12
pot                         5
Cp                          2
Jsl-I-3905                  1
South Pacific Ocean         1
trawl-otter                 1
camera - drop               1
GMST                        1
GMT                         1
Name: count, dtype: int64

# filter only species type
df = df[df.TaxonRank == 'species']

# gather and rename columns. Only keep relevant columns
df = df[['ScientificName', 'ObservationDate', 'latitude', 'longitude', 'DepthInMeters', 'SamplingEquipment']]
df.columns = ['sci_name', 'date', 'lat', 'lon', 'depth_m', 'equipment']

# convert to datetimes and convert lat/lon to numeric
df.date = pd.to_datetime(df.date, format='mixed')
df.lat = pd.to_numeric(df.lat, errors='coerce')
df.lon = pd.to_numeric(df.lon, errors='coerce')

# filter nan or None
df = df.dropna()

print('unique species: ', df['sci_name'].nunique())
print('newest date: ', df['date'].min())
print('oldest date: ', df['date'].max())
print('northenmost point: ', df['lat'].max())
print('southernmost point: ', df['lat'].min())

dfcumsum = df.sci_name.value_counts().cumsum()

# what top X species make up 95% of the observations?
n95 = dfcumsum.searchsorted(df.shape[0] * 0.95)
print(f'top {n95} species make up 95% of the observations')

n50 = dfcumsum.searchsorted(df.shape[0] * 0.5)
print(f'top {n50} species make up 50% of the observations')

unique species:  1452
newest date:  1868-05-04 00:00:00
oldest date:  2016-03-27 00:00:00
northenmost point:  72.32
southernmost point:  -78.4
top 214 species make up 95% of the observations
top 10 species make up 50% of the observations

max_year = df['date'].max().year
min_year = df['date'].min().year
bins = max_year - min_year

n95 = df['date'].quantile(0.05)
print('95% of the observations were made after', n95.year)

# histogram of observations over date
plt.figure(figsize=(20, 5))
sns.histplot(data=df, x='date', bins=bins, color='blue')
plt.axvline(n95, color='red', linestyle='--')
plt.xlabel('Date')
plt.ylabel('Total Observations')
plt.show()

95% of the observations were made after 1985

species = df.groupby('sci_name').size().sort_values(ascending=False)

fig, axs = plt.subplots(figsize=(20, 10))
sns.barplot(x=species[:10].index, y=species[:10].values)
plt.xticks(rotation=45)
for bars in axs.containers:
    axs.bar_label(bars)
plt.title('Top 10 Species Observed')
plt.xlabel('Species')
plt.ylabel('Count')
plt.show()

fig, ax = plt.subplots(figsize=(10, 10))
countries = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
countries.plot(ax=ax, color='white', edgecolor='black')

topdf = df[df['sci_name'].isin(species[:10].index)]

sns.scatterplot(
    data=topdf,
    x='lon',
    y='lat',
    hue='sci_name',
    s=20,
    edgecolor='black',
)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

top_species = species[:10].index

topdf = df[df['sci_name'].isin(top_species)]

fig, ax = plt.subplots(figsize=(20, 25))
fig.tight_layout(pad=3.0)

for index, spec in enumerate(list(top_species)):
    ax = plt.subplot(5, 2, index + 1)
    sdf = topdf[topdf.sci_name == spec]
    sns.histplot(data=sdf, x='depth_m', bins=20, color='blue')
    plt.xlabel('Depth (m)')
    plt.ylabel('Total Observations')
    plt.title(f'Depth of {spec}. Total Observations: {sdf.shape[0]}')

hr_df = df[df.sci_name == 'Heteropolypus ritteri']

# percentage of observations between 500 and 1000 meters
pha = hr_df[(hr_df.depth_m >= 200) & (hr_df.depth_m <= 2000)].shape[0] / hr_df.shape[0]
pho = hr_df[(hr_df.depth_m < 200) | (hr_df.depth_m > 2000)].shape[0] / hr_df.shape[0]

print('The hypothesis is: The species Heteropolypus Ritteri is found at depths between 200 and 2000 meters.')

print('Percentage of observations between 500 and 1000 meters: ', pha)
print('Percentage of observations outside of 500 and 1000 meters: ', pho)

if pho < 0.05:
    print('We can reject the null hypothesis that the observations are not found at depths between 200 and 2000 meters')
else:
    print('We accept the null hypothesis that the observations are not found at depths between 200 and 2000 meters')

The hypothesis is: The species Heteropolypus Ritteri is found at depths between 200 and 2000 meters.
Percentage of observations between 500 and 1000 meters:  0.9682909769176964
Percentage of observations outside of 500 and 1000 meters:  0.03170902308230357
We can reject the null hypothesis that the observations are not found at depths between 200 and 2000 meters

hr_df = df[df.sci_name == 'Heteropolypus ritteri']

# percentage of observations above latitude 30
pha = hr_df[hr_df.lat >= 30].shape[0] / hr_df.shape[0]
pho = hr_df[hr_df.lat < 30].shape[0] / hr_df.shape[0]

print('The hypothesis is: TThe species Heteropolypus Ritteri is found above latitude 30 degrees')

print('Percentage of observations above latitude 30: ', pha)
print('Percentage of observations below latitude 30: ', pho)

if pho < 0.05:
    print('We can reject the null hypothesis that the observations are not found above latitude 30 degrees')
else:
    print('We accept the null hypothesis that the observations are not found above latitude 30 degrees')

The hypothesis is: TThe species Heteropolypus Ritteri is found above latitude 30 degrees
Percentage of observations above latitude 30:  1.0
Percentage of observations below latitude 30:  0.0
We can reject the null hypothesis that the observations are not found above latitude 30 degrees

hr_df = df[df.sci_name == 'Heteropolypus ritteri']

# We are utilizing a very niave test here. This is for demonstration purposes only.
# lon between 60W and 0, lat between 50N and 50S

# percentage of observations found in Atlantic Ocean
pha = hr_df[(hr_df.lon >= -60) & (hr_df.lon <= 0) & (hr_df.lat >= -50) & (hr_df.lat <= 50)].shape[0] / hr_df.shape[0]
pho = hr_df[(hr_df.lon < -60) | (hr_df.lon > 0) | (hr_df.lat < -50) | (hr_df.lat > 50)].shape[0] / hr_df.shape[0]

print('The hypothesis is: The species Heteropolypus Ritteri is found in the Atlantic Ocean')

print('Percentage of observations found in Atlantic Ocean: ', pha)
print('Percentage of observations found outside of Atlantic Ocean: ', pho)

if pho < 0.05:
    print('We can reject the null hypothesis that the observations are not found in the Atlantic Ocean')
else:
    print('We accept the null hypothesis that the observations are not found in the Atlantic Ocean')

The hypothesis is: The species Heteropolypus Ritteri is found in the Atlantic Ocean
Percentage of observations found in Atlantic Ocean:  0.0
Percentage of observations found outside of Atlantic Ocean:  1.0
We accept the null hypothesis that the observations are not found in the Atlantic Ocean

Deep Sea Coral Analysis¶

Exploration¶

Data Visualization¶

Time Range¶

Top Most Common Species¶

Geographical Location¶

Depth Location¶

Hypothesis Testing¶

Final Remarks¶