In [1]:
import cudf
import cuml

import cuxfilter as cxf

In [2]:
gdf = cudf.read_csv('./data/pop_sample.csv', dtype=['float32', 'float32', 'float32'])
print(gdf.dtypes)
gdf.shape

northing    float32
easting     float32
infected    float32
dtype: object


(1000000, 3)

In [3]:
gdf.head()

Unnamed: 0,northing,easting,infected
0,178547.296875,368012.125,0.0
1,174068.28125,543802.125,0.0
2,358293.6875,435639.875,0.0
3,87240.304688,389607.375,0.0
4,158261.015625,340764.9375,0.0


In [4]:
gdf['infected'].value_counts()

infected
0.0    984331
1.0     15669
Name: count, dtype: int64

In [200]:
dbscan = cuml.DBSCAN(eps=10000, min_samples=5)
# dbscan = cuml.DBSCAN(eps=5000)

infected_df = gdf[gdf['infected'] == 1].reset_index()
infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])

# filer out ungrouped
infected_df = infected_df[infected_df['cluster'] != -1].reset_index()


ncolors = infected_df["cluster"].nunique()
ncolors

9

In [201]:
infected_df.head()

Unnamed: 0,level_0,index,northing,easting,infected,cluster
0,0,41,595509.25,426545.59375,1.0,4
1,1,182,185086.609375,547718.0,1.0,0
2,2,190,183406.546875,528708.0625,1.0,0
3,3,216,373846.5625,485603.84375,1.0,0
4,4,266,194947.40625,535378.375,1.0,0


In [202]:
infected_df["cluster"].unique()

0    4
1    0
2    1
3    7
4    2
5    5
6    6
7    3
8    8
Name: cluster, dtype: int32

In [203]:
from pyproj import CRS
import numpy as np
from pyproj import Transformer

crs_UK = CRS.from_epsg(27700) # GB coord system
crs_world = CRS.from_epsg(3857) # used by maps

In [204]:
transformed = Transformer.from_crs(crs_UK, crs_world).transform(infected_df["easting"].to_numpy(), infected_df["northing"].to_numpy())

In [205]:
tdf = cudf.DataFrame(np.column_stack(transformed), columns=["easting", "northing"])
tdf["cluster"]=infected_df["cluster"]

In [206]:
tdf.head()

Unnamed: 0,easting,northing,cluster
0,-176323.847587,7411134.0,4
1,14346.823349,6718332.0,0
2,-16220.876885,6716462.0,0
3,-79964.564629,7030256.0,0
4,-5026.734982,6734763.0,0


In [207]:
cxf_data = cxf.DataFrame.from_dataframe(tdf)

In [208]:
from bokeh.palettes import Turbo256, Category20

In [209]:
scatter_chart = cxf.charts.datashader.scatter(
    x='easting', y='northing', 
    aggregate_col="cluster",
    aggregate_fn="max",
    tile_provider="CartoDark",
    legend=False,
    color_palette=Category20[ncolors],
    # color_palette=Turbo256,
    point_size=5,
    point_shape="rect_vertical",
    pixel_density=1, #0.8,
    # unselected_alpha=0,
)

cluster_widget = cxf.charts.panel_widgets.multi_select('cluster')


inf_density_chart = cxf.charts.datashader.scatter(
    x='easting', y='northing', 
    aggregate_col="northing",
    aggregate_fn="count",
    tile_provider="CartoDark",
    point_shape="rect_vertical",
    color_palette=Turbo256,
    legend=False,
    unselected_alpha=0,
)


In [210]:
if dash != None:
    dash.stop()

In [211]:
dash = cxf_data.dashboard(charts=[scatter_chart, inf_density_chart], sidebar=[cluster_widget], theme=cxf.themes.dark, layout=cxf.layouts.double_feature, data_size_widget=True)

In [212]:
dash.show("http://44.222.106.171/lab/proxy/8789", port=8789)

Dashboard running at port 8789


In [None]:
dash.app()

In [84]:
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

**Well Done!** Let's move to the [next notebook](3-04_logistic_regression.ipynb). 

<img src="./images/DLI_Header.png" width=400/>