aboutsummaryrefslogtreecommitdiff
path: root/py.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'py.ipynb')
-rw-r--r--py.ipynb296
1 files changed, 296 insertions, 0 deletions
diff --git a/py.ipynb b/py.ipynb
new file mode 100644
index 0000000..b903e36
--- /dev/null
+++ b/py.ipynb
@@ -0,0 +1,296 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa17ff76-7c65-44e2-b36e-23ad9ffe4c9c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import cudf\n",
+ "import cupy as cp\n",
+ "import numpy as np\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "170ea4a3-1b4d-41c3-9e17-fd8943dc6b31",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'status': 'ok', 'restart': True}"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import IPython\n",
+ "app = IPython.Application.instance()\n",
+ "app.kernel.do_shutdown(True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b5a0b20f-ff99-401d-804c-568f0bfb2f11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DIR = \"./data\"\n",
+ "BIG_KM = 2000.0\n",
+ "MAX_GAP_HOURS = 6\n",
+ "MAX_GAP = np.timedelta64(MAX_GAP_HOURS, \"h\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "fc1282ce-b717-453e-bc64-9fbb758a2fdc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "users = cudf.read_csv(f\"{DIR}/users.csv\", dtype={\"id\": \"uint64\"})\n",
+ "flights = cudf.read_csv(f\"{DIR}/flights.csv\")\n",
+ "user_flights = cudf.read_csv(\n",
+ " f\"{DIR}/user_flights.csv\", dtype={\"user_id\": \"uint64\", \"flight_id\": \"uint64\"}\n",
+ ")\n",
+ "cards = cudf.read_csv(\n",
+ " f\"{DIR}/cards.csv\", dtype={\"id\": \"uint64\", \"user_id\": \"uint64\", \"number\": \"uint64\"}\n",
+ ")\n",
+ "card_flights = cudf.read_csv(\n",
+ " f\"{DIR}/card_flights.csv\", dtype={\"card_id\": \"uint64\", \"flight_id\": \"uint64\"}\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bdcaa3d1-6414-425f-9a40-f65f74c004b6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "has_time = flights[\"has_time\"].astype(\"str\").str.strip().str.lower()\n",
+ "has_time = (has_time == \"true\")\n",
+ "\n",
+ "date = flights[\"dep_date\"].fillna(\"\").astype(\"str\").str.strip()\n",
+ "time = flights[\"dep_time\"].fillna(\"\").astype(\"str\").str.strip()\n",
+ "\n",
+ "mask_date = date.str.match(r\"^\\d{4}-\\d{2}-\\d{2}$\")\n",
+ "mask_time = time.str.match(r\"^\\d{2}:\\d{2}(:\\d{2})?$\") \n",
+ "\n",
+ "mask_dt = mask_date & mask_time & has_time \n",
+ "mask_d = mask_date & (~has_time | ~mask_time) \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "302947fa-af35-4fad-91bc-6da2468d9708",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for c in (\"fromlat\", \"fromlon\", \"tolat\", \"tolon\"):\n",
+ " flights[c] = flights[c].astype(\"float64\")\n",
+ "\n",
+ "R = cp.float64(6371.0088)\n",
+ "rad = cp.float64(cp.pi / 180.0)\n",
+ "\n",
+ "lat1 = flights[\"fromlat\"].values * rad\n",
+ "lon1 = flights[\"fromlon\"].values * rad\n",
+ "lat2 = flights[\"tolat\"].values * rad\n",
+ "lon2 = flights[\"tolon\"].values * rad\n",
+ "\n",
+ "dlat = lat2 - lat1\n",
+ "dlon = lon2 - lon1\n",
+ "a = cp.sin(dlat * 0.5)**2 + cp.cos(lat1) * cp.cos(lat2) * cp.sin(dlon * 0.5)**2\n",
+ "flights[\"distance_km\"] = cudf.Series(R * (2.0 * cp.arcsin(cp.sqrt(a))))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b141f399-4f52-40d0-a2f6-6434ab1ebb74",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "map_user_flights = user_flights[[\"user_id\", \"flight_id\"]]\n",
+ "\n",
+ "cards_min = cards[[\"id\", \"user_id\"]].rename(columns={\"id\": \"card_id\"})\n",
+ "map_card_users = card_flights.merge(cards_min, on=\"card_id\", how=\"left\")[[\"user_id\", \"flight_id\"]]\n",
+ "\n",
+ "user_flight_map = cudf.concat([map_user_flights, map_card_users], ignore_index=True)\n",
+ "user_flight_map = user_flight_map.dropna(subset=[\"user_id\", \"flight_id\"]).drop_duplicates()\n",
+ "\n",
+ "uf = user_flight_map.merge(\n",
+ " flights,\n",
+ " left_on=\"flight_id\",\n",
+ " right_on=\"id\",\n",
+ " how=\"inner\", \n",
+ ")\n",
+ "uf = uf.dropna(subset=[\"dep_ts\"]).sort_values([\"user_id\", \"dep_ts\", \"flight_id\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "0cb5b280-451b-4cf1-ad61-de990b490d63",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "uid = \"user_id\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "44c7edf4-276d-49c2-a93d-2b8a60bcfaab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prev_dep = uf.groupby(uid)[\"dep_ts\"].shift(1)\n",
+ "gap = uf[\"dep_ts\"] - prev_dep\n",
+ "cond_lt6h = gap <= MAX_GAP\n",
+ "is_new_seg = cond_lt6h.isna() | (~cond_lt6h)\n",
+ "seg_id = is_new_seg.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
+ "streak_sizes = uf.groupby([uid, seg_id]).size().rename(\"streak_len\")\n",
+ "max_streak_lt6h = streak_sizes.groupby(level=0).max().rename(\"max_streak_lt6h\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "6cbe6321-2033-413e-9dd4-305c339361fc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "big = uf[\"distance_km\"] >= BIG_KM\n",
+ "big_new = big.isna() | (~big)\n",
+ "big_seg = big_new.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
+ "big_sizes = uf[big].groupby([uid, big_seg]).size().rename(\"big_run_len\")\n",
+ "count_big_streaks_ge3 = (big_sizes >= 3).groupby(level=0).sum().astype(\"int64\").rename(\"count_big_streaks_ge3\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "6668873f-f36d-4423-a735-d744647db88e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route = uf[\"from\"].astype(\"str\") + \"→\" + uf[\"to\"].astype(\"str\")\n",
+ "route_prev = route.groupby(uf[uid]).shift(1)\n",
+ "route_change = route_prev.isna() | (route != route_prev)\n",
+ "route_seg = route_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
+ "route_run_sizes = uf.groupby([uid, route_seg]).size().rename(\"route_run_len\")\n",
+ "max_consec_same_route = route_run_sizes.groupby(level=0).max().rename(\"max_consec_same_route\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3d44e46-147b-433f-b985-5d6b3baefd6e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "orig = uf[\"from\"].astype(\"str\")\n",
+ "orig_prev = orig.groupby(uf[uid]).shift(1)\n",
+ "orig_change = orig_prev.isna() | (orig != orig_prev)\n",
+ "orig_seg = orig_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
+ "orig_run_sizes = uf.groupby([uid, orig_seg]).size().rename(\"orig_run_len\")\n",
+ "max_consec_same_origin = orig_run_sizes.groupby(level=0).max().rename(\"max_consec_same_origin\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "52028ba2-bd19-44fc-a7eb-bf76be246b79",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "longest_distance_km = uf.groupby(uid)[\"distance_km\"].max().rename(\"longest_distance_km\")\n",
+ "\n",
+ "gap_days = (gap / np.timedelta64(1, \"D\")).astype(\"float64\")\n",
+ "avg_days_between = gap_days.groupby(uf[uid]).mean().rename(\"avg_days_between_flights\")\n",
+ "\n",
+ "unique_airports = cudf.concat(\n",
+ " [\n",
+ " uf[[uid, \"from\"]].rename(columns={\"from\": \"airport\"}),\n",
+ " uf[[uid, \"to\"]].rename(columns={\"to\": \"airport\"}),\n",
+ " ],\n",
+ " ignore_index=True,\n",
+ ").groupby(uid)[\"airport\"].nunique().astype(\"int64\").rename(\"unique_airports\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "88e632af-278b-43af-a15d-7e9e9235afe6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "metrics = (\n",
+ " max_streak_lt6h.to_frame()\n",
+ " .join(count_big_streaks_ge3, how=\"left\")\n",
+ " .join(max_consec_same_route, how=\"left\")\n",
+ " .join(max_consec_same_origin, how=\"left\")\n",
+ " .join(unique_airports, how=\"left\")\n",
+ " .join(longest_distance_km, how=\"left\")\n",
+ " .join(avg_days_between, how=\"left\")\n",
+ ").fillna({\n",
+ " \"max_streak_lt6h\": 0,\n",
+ " \"count_big_streaks_ge3\": 0,\n",
+ " \"max_consec_same_route\": 0,\n",
+ " \"max_consec_same_origin\": 0,\n",
+ " \"unique_airports\": 0,\n",
+ "})\n",
+ "\n",
+ "users_analytics = users.set_index(\"id\").join(metrics, how=\"left\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "951fe3b3-dac6-4c3a-8e83-80a048f5864d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "users_analytics.to_csv(f\"{DIR}/users_analytics_gpu_cudf.csv\", index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "86817955-42e9-480e-8ffd-2a628816f6a9",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}