Generating an ATL08 GeoParquet Store#
This notebook creates a GeoParquet store from scratch using a subset of ICESat-2 ATL08 files. GeoParquet builds on Apache Parquet, an open-source, column-oriented file format that enables efficient storage and retrieval through high-performance compression.
The conversion functions live in the helper module atl08_parquet_helpers and were originally written by Sean Harkins of Development Seed in developmentseed/icesat-parquet.
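To make the format concrete, here is a minimal, self-contained sketch (not part of the ATL08 workflow; the file name and points are placeholders) showing that GeoParquet is simply Parquet with a geometry column plus CRS metadata, written and read here with geopandas.
import geopandas as gpd
from shapely.geometry import Point
# A toy GeoDataFrame: one attribute column plus a geometry column
toy = gpd.GeoDataFrame(
    {"h_canopy": [12.5, 7.3]},
    geometry=[Point(-60.0, -3.0), Point(-55.0, -10.0)],
    crs="EPSG:4326",
)
toy.to_parquet("toy.parquet")    # writes GeoParquet: Parquet plus geometry/CRS metadata
gpd.read_parquet("toy.parquet")  # round-trips back to a GeoDataFrame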
Warning
This work is experimental
1. Install and import the necessary libraries#
!pip install pyarrow geoarrow-pyarrow geopandas earthaccess==0.9.0 lonboard palettable matplotlib cartopy jupyterlab_vim
import atl08_parquet_helpers as aph
from datetime import datetime, timezone, timedelta
import earthaccess
import fsspec
import geopandas as gpd
from lonboard import viz
import os
import pyarrow.parquet as pq
from shapely import wkb
2. Log in to earthaccess using Earthdata Login (URS) credentials, then set up an S3 client with temporary credentials for the NSIDC DAAC#
earthaccess.login()
aws_creds = earthaccess.get_s3_credentials(daac='NSIDC')
s3 = fsspec.filesystem(
's3',
anon=False,
key=aws_creds['accessKeyId'],
secret=aws_creds['secretAccessKey'],
token=aws_creds['sessionToken'],
)
3. Search for a subset of ATL08 granules using the earthaccess library#
This search covers just one week of results over South America.
start = datetime(2021, 11, 1, tzinfo=timezone.utc)
end = start + timedelta(days=7)
results = earthaccess.search_data(
short_name="ATL08",
cloud_hosted=True,
temporal=(start, end),
bounding_box=(-90,-56,-32,14),
count=-1
)
year_month = f"year={start.year}/month={start.month}"
week = 0
len(results)
4. Sort the results and set up the Parquet table#
sorted_results = sorted(results, key=lambda r : datetime.strptime(r['umm']['TemporalExtent']['RangeDateTime']['BeginningDateTime'], '%Y-%m-%dT%H:%M:%S.%fZ'))
template_file = s3.open(sorted_results[0].data_links(access="direct")[0], 'rb')
atl08_parquet = aph.ParquetTable(
geometadata_file='atl08-parquet-metadata.json',
template_file=template_file
)
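As an optional aside, you can peek at what the helper will be converting by opening one granule directly with h5py. The sketch below assumes h5py is installed; the dataset path mentioned in the comment is typical of ATL08 but may differ between product releases.
import h5py
# Open a fresh handle to the first granule; h5py accepts the fsspec file-like object
with h5py.File(s3.open(sorted_results[0].data_links(access="direct")[0], 'rb'), 'r') as f:
    # ATL08 organizes data by ground track (gt1l, gt1r, gt2l, gt2r, gt3l, gt3r);
    # canopy heights typically live under <beam>/land_segments/canopy/h_canopy
    print([k for k in f.keys() if k.startswith('gt')])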
5. Write results to the parquet table#
Write the results to a single Parquet file, using the year and month as a partition. If we add more weeks later, we can write them to new Parquet files and new partitions as appropriate; a sketch of that follow-on step appears after the write cell below.
%%time
# fetch a group and write them to a partition
directory="atl08_parquet"
os.makedirs(f"{directory}/{year_month}", exist_ok=True)
# Writing appears to proceed one beam at a time, even with more workers, because of HDF5's global library lock
atl08_parquet.write_results_by_partition(sorted_results, s3, parquet_file=f"{directory}/{year_month}/{week}.parquet")
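As a sketch of that follow-on step, the cell below shows how a second week of results might be appended as a new Parquet file (here week index 1) in the appropriate partition. The next_* names are illustrative; the search and sort simply mirror the cells above.
# Hypothetical follow-on: fetch the following week and write it to a new partition file
next_start = end
next_end = next_start + timedelta(days=7)
next_results = earthaccess.search_data(
    short_name="ATL08",
    cloud_hosted=True,
    temporal=(next_start, next_end),
    bounding_box=(-90, -56, -32, 14),
    count=-1,
)
# ISO-8601 timestamps sort lexicographically, so a plain string sort is equivalent here
next_sorted = sorted(next_results, key=lambda r: r['umm']['TemporalExtent']['RangeDateTime']['BeginningDateTime'])
next_year_month = f"year={next_start.year}/month={next_start.month}"
os.makedirs(f"{directory}/{next_year_month}", exist_ok=True)
atl08_parquet.write_results_by_partition(next_sorted, s3, parquet_file=f"{directory}/{next_year_month}/{week + 1}.parquet")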
We’re done creating the parquet!#
Now we can check out the results.
# The hive partitioning scheme assumes directory names with key=value pairs like "/year=2009/month=11"
# Partitioning speeds up queries as the query engine only needs to look at certain paths which match the key/value pairs used in creating the partitions.
dataset = pq.ParquetDataset(
    "atl08_parquet",
    partitioning="hive",
    filters=[
        ('year', '>=', 2021), ('year', '<=', 2021),
        ('month', '>=', 11), ('month', '<=', 11),
    ],
)
table = dataset.read(columns=["h_canopy", "geometry"])
df = table.to_pandas()
df['geometry'] = df['geometry'].apply(wkb.loads)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
gdf
# Invalid h_canopy segments carry a large floating-point fill value, which shows up as the column maximum
null_value = gdf['h_canopy'].max()
gdf_filtered = gdf.loc[gdf['h_canopy'] != null_value]
gdf_filtered
gdf_filtered['h_canopy'].min(), gdf_filtered['h_canopy'].max()
%%time
# With a large dataframe this can take a long time and/or crash the kernel,
# depending on your available memory
import matplotlib.pyplot as plt
import matplotlib
import cartopy.crs as ccrs
crs = ccrs.PlateCarree()
fig, ax = plt.subplots(subplot_kw=dict(projection=crs))
gdf_filtered.plot(column='h_canopy', ax=ax, legend=True, cmap='viridis')
ax.set_extent([-116,-23,-32,21])
ax.set_title('h_canopy plot')
# Add coastlines and gridlines
ax.coastlines()
ax.gridlines(draw_labels=True)
# Show plot
plt.show()
%%time
from lonboard import Map, ScatterplotLayer
from lonboard.colormap import apply_continuous_cmap
from palettable.colorbrewer.diverging import BrBG_10
layer = ScatterplotLayer.from_geopandas(gdf_filtered)
h_canopy = gdf_filtered['h_canopy']
# apply_continuous_cmap expects values scaled to the 0-1 range, so normalize first
normalized = (h_canopy - h_canopy.min()) / (h_canopy.max() - h_canopy.min())
layer.get_fill_color = apply_continuous_cmap(normalized, BrBG_10, alpha=0.7)
m = Map(layer)
m