{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "aa17ff76-7c65-44e2-b36e-23ad9ffe4c9c", "metadata": {}, "outputs": [], "source": [ "import cudf\n", "import cupy as cp\n", "import numpy as np\n" ] }, { "cell_type": "code", "execution_count": 32, "id": "170ea4a3-1b4d-41c3-9e17-fd8943dc6b31", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'status': 'ok', 'restart': True}" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import IPython\n", "app = IPython.Application.instance()\n", "app.kernel.do_shutdown(True)" ] }, { "cell_type": "code", "execution_count": null, "id": "b5a0b20f-ff99-401d-804c-568f0bfb2f11", "metadata": {}, "outputs": [], "source": [ "DIR = \"./data\"\n", "BIG_KM = 2000.0\n", "MAX_GAP_HOURS = 6\n", "MAX_GAP = np.timedelta64(MAX_GAP_HOURS, \"h\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "fc1282ce-b717-453e-bc64-9fbb758a2fdc", "metadata": {}, "outputs": [], "source": [ "users = cudf.read_csv(f\"{DIR}/users.csv\", dtype={\"id\": \"uint64\"})\n", "flights = cudf.read_csv(f\"{DIR}/flights.csv\")\n", "user_flights = cudf.read_csv(\n", " f\"{DIR}/user_flights.csv\", dtype={\"user_id\": \"uint64\", \"flight_id\": \"uint64\"}\n", ")\n", "cards = cudf.read_csv(\n", " f\"{DIR}/cards.csv\", dtype={\"id\": \"uint64\", \"user_id\": \"uint64\", \"number\": \"uint64\"}\n", ")\n", "card_flights = cudf.read_csv(\n", " f\"{DIR}/card_flights.csv\", dtype={\"card_id\": \"uint64\", \"flight_id\": \"uint64\"}\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "bdcaa3d1-6414-425f-9a40-f65f74c004b6", "metadata": {}, "outputs": [], "source": [ "has_time = flights[\"has_time\"].astype(\"str\").str.strip().str.lower()\n", "has_time = (has_time == \"true\")\n", "\n", "date = flights[\"dep_date\"].fillna(\"\").astype(\"str\").str.strip()\n", "time = flights[\"dep_time\"].fillna(\"\").astype(\"str\").str.strip()\n", "\n", "mask_date = date.str.match(r\"^\\d{4}-\\d{2}-\\d{2}$\")\n", "mask_time = time.str.match(r\"^\\d{2}:\\d{2}(:\\d{2})?$\") \n", "\n", "mask_dt = mask_date & mask_time & has_time \n", "mask_d = mask_date & (~has_time | ~mask_time) \n" ] }, { "cell_type": "code", "execution_count": 9, "id": "302947fa-af35-4fad-91bc-6da2468d9708", "metadata": {}, "outputs": [], "source": [ "for c in (\"fromlat\", \"fromlon\", \"tolat\", \"tolon\"):\n", " flights[c] = flights[c].astype(\"float64\")\n", "\n", "R = cp.float64(6371.0088)\n", "rad = cp.float64(cp.pi / 180.0)\n", "\n", "lat1 = flights[\"fromlat\"].values * rad\n", "lon1 = flights[\"fromlon\"].values * rad\n", "lat2 = flights[\"tolat\"].values * rad\n", "lon2 = flights[\"tolon\"].values * rad\n", "\n", "dlat = lat2 - lat1\n", "dlon = lon2 - lon1\n", "a = cp.sin(dlat * 0.5)**2 + cp.cos(lat1) * cp.cos(lat2) * cp.sin(dlon * 0.5)**2\n", "flights[\"distance_km\"] = cudf.Series(R * (2.0 * cp.arcsin(cp.sqrt(a))))" ] }, { "cell_type": "code", "execution_count": null, "id": "b141f399-4f52-40d0-a2f6-6434ab1ebb74", "metadata": {}, "outputs": [], "source": [ "map_user_flights = user_flights[[\"user_id\", \"flight_id\"]]\n", "\n", "cards_min = cards[[\"id\", \"user_id\"]].rename(columns={\"id\": \"card_id\"})\n", "map_card_users = card_flights.merge(cards_min, on=\"card_id\", how=\"left\")[[\"user_id\", \"flight_id\"]]\n", "\n", "user_flight_map = cudf.concat([map_user_flights, map_card_users], ignore_index=True)\n", "user_flight_map = user_flight_map.dropna(subset=[\"user_id\", \"flight_id\"]).drop_duplicates()\n", "\n", "uf = user_flight_map.merge(\n", " flights,\n", " left_on=\"flight_id\",\n", " right_on=\"id\",\n", " how=\"inner\", \n", ")\n", "uf = uf.dropna(subset=[\"dep_ts\"]).sort_values([\"user_id\", \"dep_ts\", \"flight_id\"])" ] }, { "cell_type": "code", "execution_count": 11, "id": "0cb5b280-451b-4cf1-ad61-de990b490d63", "metadata": {}, "outputs": [], "source": [ "uid = \"user_id\"" ] }, { "cell_type": "code", "execution_count": 12, "id": "44c7edf4-276d-49c2-a93d-2b8a60bcfaab", "metadata": {}, "outputs": [], "source": [ "prev_dep = uf.groupby(uid)[\"dep_ts\"].shift(1)\n", "gap = uf[\"dep_ts\"] - prev_dep\n", "cond_lt6h = gap <= MAX_GAP\n", "is_new_seg = cond_lt6h.isna() | (~cond_lt6h)\n", "seg_id = is_new_seg.astype(\"int32\").groupby(uf[uid]).cumsum()\n", "streak_sizes = uf.groupby([uid, seg_id]).size().rename(\"streak_len\")\n", "max_streak_lt6h = streak_sizes.groupby(level=0).max().rename(\"max_streak_lt6h\")\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "6cbe6321-2033-413e-9dd4-305c339361fc", "metadata": {}, "outputs": [], "source": [ "big = uf[\"distance_km\"] >= BIG_KM\n", "big_new = big.isna() | (~big)\n", "big_seg = big_new.astype(\"int32\").groupby(uf[uid]).cumsum()\n", "big_sizes = uf[big].groupby([uid, big_seg]).size().rename(\"big_run_len\")\n", "count_big_streaks_ge3 = (big_sizes >= 3).groupby(level=0).sum().astype(\"int64\").rename(\"count_big_streaks_ge3\")\n" ] }, { "cell_type": "code", "execution_count": 14, "id": "6668873f-f36d-4423-a735-d744647db88e", "metadata": {}, "outputs": [], "source": [ "route = uf[\"from\"].astype(\"str\") + \"→\" + uf[\"to\"].astype(\"str\")\n", "route_prev = route.groupby(uf[uid]).shift(1)\n", "route_change = route_prev.isna() | (route != route_prev)\n", "route_seg = route_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n", "route_run_sizes = uf.groupby([uid, route_seg]).size().rename(\"route_run_len\")\n", "max_consec_same_route = route_run_sizes.groupby(level=0).max().rename(\"max_consec_same_route\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b3d44e46-147b-433f-b985-5d6b3baefd6e", "metadata": {}, "outputs": [], "source": [ "orig = uf[\"from\"].astype(\"str\")\n", "orig_prev = orig.groupby(uf[uid]).shift(1)\n", "orig_change = orig_prev.isna() | (orig != orig_prev)\n", "orig_seg = orig_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n", "orig_run_sizes = uf.groupby([uid, orig_seg]).size().rename(\"orig_run_len\")\n", "max_consec_same_origin = orig_run_sizes.groupby(level=0).max().rename(\"max_consec_same_origin\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "52028ba2-bd19-44fc-a7eb-bf76be246b79", "metadata": {}, "outputs": [], "source": [ "longest_distance_km = uf.groupby(uid)[\"distance_km\"].max().rename(\"longest_distance_km\")\n", "\n", "gap_days = (gap / np.timedelta64(1, \"D\")).astype(\"float64\")\n", "avg_days_between = gap_days.groupby(uf[uid]).mean().rename(\"avg_days_between_flights\")\n", "\n", "unique_airports = cudf.concat(\n", " [\n", " uf[[uid, \"from\"]].rename(columns={\"from\": \"airport\"}),\n", " uf[[uid, \"to\"]].rename(columns={\"to\": \"airport\"}),\n", " ],\n", " ignore_index=True,\n", ").groupby(uid)[\"airport\"].nunique().astype(\"int64\").rename(\"unique_airports\")\n" ] }, { "cell_type": "code", "execution_count": 17, "id": "88e632af-278b-43af-a15d-7e9e9235afe6", "metadata": {}, "outputs": [], "source": [ "metrics = (\n", " max_streak_lt6h.to_frame()\n", " .join(count_big_streaks_ge3, how=\"left\")\n", " .join(max_consec_same_route, how=\"left\")\n", " .join(max_consec_same_origin, how=\"left\")\n", " .join(unique_airports, how=\"left\")\n", " .join(longest_distance_km, how=\"left\")\n", " .join(avg_days_between, how=\"left\")\n", ").fillna({\n", " \"max_streak_lt6h\": 0,\n", " \"count_big_streaks_ge3\": 0,\n", " \"max_consec_same_route\": 0,\n", " \"max_consec_same_origin\": 0,\n", " \"unique_airports\": 0,\n", "})\n", "\n", "users_analytics = users.set_index(\"id\").join(metrics, how=\"left\")\n" ] }, { "cell_type": "code", "execution_count": 20, "id": "951fe3b3-dac6-4c3a-8e83-80a048f5864d", "metadata": {}, "outputs": [], "source": [ "users_analytics.to_csv(f\"{DIR}/users_analytics_gpu_cudf.csv\", index=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "86817955-42e9-480e-8ffd-2a628816f6a9", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 5 }