From bb833561aa74f02970aee13cdc75973b29716491 Mon Sep 17 00:00:00 2001 From: leshe4ka46 Date: Mon, 27 Oct 2025 20:36:28 +0300 Subject: # This is a combination of 2 commits. # This is the 1st commit message: unmarshal all formats, merge them in the single table, users are truly unique # This is the commit message #2: i --- py.ipynb | 296 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 py.ipynb (limited to 'py.ipynb') diff --git a/py.ipynb b/py.ipynb new file mode 100644 index 0000000..b903e36 --- /dev/null +++ b/py.ipynb @@ -0,0 +1,296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aa17ff76-7c65-44e2-b36e-23ad9ffe4c9c", + "metadata": {}, + "outputs": [], + "source": [ + "import cudf\n", + "import cupy as cp\n", + "import numpy as np\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "170ea4a3-1b4d-41c3-9e17-fd8943dc6b31", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5a0b20f-ff99-401d-804c-568f0bfb2f11", + "metadata": {}, + "outputs": [], + "source": [ + "DIR = \"./data\"\n", + "BIG_KM = 2000.0\n", + "MAX_GAP_HOURS = 6\n", + "MAX_GAP = np.timedelta64(MAX_GAP_HOURS, \"h\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fc1282ce-b717-453e-bc64-9fbb758a2fdc", + "metadata": {}, + "outputs": [], + "source": [ + "users = cudf.read_csv(f\"{DIR}/users.csv\", dtype={\"id\": \"uint64\"})\n", + "flights = cudf.read_csv(f\"{DIR}/flights.csv\")\n", + "user_flights = cudf.read_csv(\n", + " f\"{DIR}/user_flights.csv\", dtype={\"user_id\": \"uint64\", \"flight_id\": \"uint64\"}\n", + ")\n", + "cards = cudf.read_csv(\n", + " f\"{DIR}/cards.csv\", dtype={\"id\": \"uint64\", \"user_id\": \"uint64\", \"number\": \"uint64\"}\n", + ")\n", + "card_flights = cudf.read_csv(\n", + " f\"{DIR}/card_flights.csv\", dtype={\"card_id\": \"uint64\", \"flight_id\": \"uint64\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdcaa3d1-6414-425f-9a40-f65f74c004b6", + "metadata": {}, + "outputs": [], + "source": [ + "has_time = flights[\"has_time\"].astype(\"str\").str.strip().str.lower()\n", + "has_time = (has_time == \"true\")\n", + "\n", + "date = flights[\"dep_date\"].fillna(\"\").astype(\"str\").str.strip()\n", + "time = flights[\"dep_time\"].fillna(\"\").astype(\"str\").str.strip()\n", + "\n", + "mask_date = date.str.match(r\"^\\d{4}-\\d{2}-\\d{2}$\")\n", + "mask_time = time.str.match(r\"^\\d{2}:\\d{2}(:\\d{2})?$\") \n", + "\n", + "mask_dt = mask_date & mask_time & has_time \n", + "mask_d = mask_date & (~has_time | ~mask_time) \n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "302947fa-af35-4fad-91bc-6da2468d9708", + "metadata": {}, + "outputs": [], + "source": [ + "for c in (\"fromlat\", \"fromlon\", \"tolat\", \"tolon\"):\n", + " flights[c] = flights[c].astype(\"float64\")\n", + "\n", + "R = cp.float64(6371.0088)\n", + "rad = cp.float64(cp.pi / 180.0)\n", + "\n", + "lat1 = flights[\"fromlat\"].values * rad\n", + "lon1 = flights[\"fromlon\"].values * rad\n", + "lat2 = flights[\"tolat\"].values * rad\n", + "lon2 = flights[\"tolon\"].values * rad\n", + "\n", + "dlat = lat2 - lat1\n", + "dlon = lon2 - lon1\n", + "a = cp.sin(dlat * 0.5)**2 + cp.cos(lat1) * cp.cos(lat2) * cp.sin(dlon * 0.5)**2\n", + "flights[\"distance_km\"] = cudf.Series(R * (2.0 * cp.arcsin(cp.sqrt(a))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b141f399-4f52-40d0-a2f6-6434ab1ebb74", + "metadata": {}, + "outputs": [], + "source": [ + "map_user_flights = user_flights[[\"user_id\", \"flight_id\"]]\n", + "\n", + "cards_min = cards[[\"id\", \"user_id\"]].rename(columns={\"id\": \"card_id\"})\n", + "map_card_users = card_flights.merge(cards_min, on=\"card_id\", how=\"left\")[[\"user_id\", \"flight_id\"]]\n", + "\n", + "user_flight_map = cudf.concat([map_user_flights, map_card_users], ignore_index=True)\n", + "user_flight_map = user_flight_map.dropna(subset=[\"user_id\", \"flight_id\"]).drop_duplicates()\n", + "\n", + "uf = user_flight_map.merge(\n", + " flights,\n", + " left_on=\"flight_id\",\n", + " right_on=\"id\",\n", + " how=\"inner\", \n", + ")\n", + "uf = uf.dropna(subset=[\"dep_ts\"]).sort_values([\"user_id\", \"dep_ts\", \"flight_id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0cb5b280-451b-4cf1-ad61-de990b490d63", + "metadata": {}, + "outputs": [], + "source": [ + "uid = \"user_id\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "44c7edf4-276d-49c2-a93d-2b8a60bcfaab", + "metadata": {}, + "outputs": [], + "source": [ + "prev_dep = uf.groupby(uid)[\"dep_ts\"].shift(1)\n", + "gap = uf[\"dep_ts\"] - prev_dep\n", + "cond_lt6h = gap <= MAX_GAP\n", + "is_new_seg = cond_lt6h.isna() | (~cond_lt6h)\n", + "seg_id = is_new_seg.astype(\"int32\").groupby(uf[uid]).cumsum()\n", + "streak_sizes = uf.groupby([uid, seg_id]).size().rename(\"streak_len\")\n", + "max_streak_lt6h = streak_sizes.groupby(level=0).max().rename(\"max_streak_lt6h\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6cbe6321-2033-413e-9dd4-305c339361fc", + "metadata": {}, + "outputs": [], + "source": [ + "big = uf[\"distance_km\"] >= BIG_KM\n", + "big_new = big.isna() | (~big)\n", + "big_seg = big_new.astype(\"int32\").groupby(uf[uid]).cumsum()\n", + "big_sizes = uf[big].groupby([uid, big_seg]).size().rename(\"big_run_len\")\n", + "count_big_streaks_ge3 = (big_sizes >= 3).groupby(level=0).sum().astype(\"int64\").rename(\"count_big_streaks_ge3\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6668873f-f36d-4423-a735-d744647db88e", + "metadata": {}, + "outputs": [], + "source": [ + "route = uf[\"from\"].astype(\"str\") + \"→\" + uf[\"to\"].astype(\"str\")\n", + "route_prev = route.groupby(uf[uid]).shift(1)\n", + "route_change = route_prev.isna() | (route != route_prev)\n", + "route_seg = route_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n", + "route_run_sizes = uf.groupby([uid, route_seg]).size().rename(\"route_run_len\")\n", + "max_consec_same_route = route_run_sizes.groupby(level=0).max().rename(\"max_consec_same_route\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3d44e46-147b-433f-b985-5d6b3baefd6e", + "metadata": {}, + "outputs": [], + "source": [ + "orig = uf[\"from\"].astype(\"str\")\n", + "orig_prev = orig.groupby(uf[uid]).shift(1)\n", + "orig_change = orig_prev.isna() | (orig != orig_prev)\n", + "orig_seg = orig_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n", + "orig_run_sizes = uf.groupby([uid, orig_seg]).size().rename(\"orig_run_len\")\n", + "max_consec_same_origin = orig_run_sizes.groupby(level=0).max().rename(\"max_consec_same_origin\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52028ba2-bd19-44fc-a7eb-bf76be246b79", + "metadata": {}, + "outputs": [], + "source": [ + "longest_distance_km = uf.groupby(uid)[\"distance_km\"].max().rename(\"longest_distance_km\")\n", + "\n", + "gap_days = (gap / np.timedelta64(1, \"D\")).astype(\"float64\")\n", + "avg_days_between = gap_days.groupby(uf[uid]).mean().rename(\"avg_days_between_flights\")\n", + "\n", + "unique_airports = cudf.concat(\n", + " [\n", + " uf[[uid, \"from\"]].rename(columns={\"from\": \"airport\"}),\n", + " uf[[uid, \"to\"]].rename(columns={\"to\": \"airport\"}),\n", + " ],\n", + " ignore_index=True,\n", + ").groupby(uid)[\"airport\"].nunique().astype(\"int64\").rename(\"unique_airports\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "88e632af-278b-43af-a15d-7e9e9235afe6", + "metadata": {}, + "outputs": [], + "source": [ + "metrics = (\n", + " max_streak_lt6h.to_frame()\n", + " .join(count_big_streaks_ge3, how=\"left\")\n", + " .join(max_consec_same_route, how=\"left\")\n", + " .join(max_consec_same_origin, how=\"left\")\n", + " .join(unique_airports, how=\"left\")\n", + " .join(longest_distance_km, how=\"left\")\n", + " .join(avg_days_between, how=\"left\")\n", + ").fillna({\n", + " \"max_streak_lt6h\": 0,\n", + " \"count_big_streaks_ge3\": 0,\n", + " \"max_consec_same_route\": 0,\n", + " \"max_consec_same_origin\": 0,\n", + " \"unique_airports\": 0,\n", + "})\n", + "\n", + "users_analytics = users.set_index(\"id\").join(metrics, how=\"left\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "951fe3b3-dac6-4c3a-8e83-80a048f5864d", + "metadata": {}, + "outputs": [], + "source": [ + "users_analytics.to_csv(f\"{DIR}/users_analytics_gpu_cudf.csv\", index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86817955-42e9-480e-8ffd-2a628816f6a9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- cgit v1.2.3