aboutsummaryrefslogtreecommitdiff
path: root/py.ipynb
blob: b903e36113b9a23c15a12a960225c789f136885f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa17ff76-7c65-44e2-b36e-23ad9ffe4c9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "import cupy as cp\n",
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "170ea4a3-1b4d-41c3-9e17-fd8943dc6b31",
   "metadata": {},
   "outputs": [],
   "source": [
    "import IPython\n",
    "\n",
    "# Optional kernel restart (e.g. to release memory held by a previous run).\n",
    "# A restart discards every name defined so far, including the `cudf`, `cp` and `np`\n",
    "# imports from the cell above that the following cells rely on, so it is disabled by\n",
    "# default to keep a top-to-bottom run working. Set the flag to True, run this cell\n",
    "# on its own, then re-run the notebook from the first cell.\n",
    "RESTART_KERNEL = False\n",
    "\n",
    "app = IPython.Application.instance()\n",
    "if RESTART_KERNEL:\n",
    "    app.kernel.do_shutdown(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5a0b20f-ff99-401d-804c-568f0bfb2f11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Thresholds used by the per-user metric cells further down.\n",
    "BIG_KM = 2000.0    # compared against `distance_km` to flag a \"big\" flight\n",
    "MAX_GAP_HOURS = 6  # largest gap between consecutive departures within one streak\n",
    "MAX_GAP = np.timedelta64(MAX_GAP_HOURS, \"h\")\n",
    "\n",
    "# Folder the input CSV files are read from and the result CSV is written to.\n",
    "DIR = \"./data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fc1282ce-b717-453e-bc64-9fbb758a2fdc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the five input tables from DIR onto the GPU.\n",
    "# Identifier columns are forced to uint64; flights.csv is read with inferred dtypes.\n",
    "users = cudf.read_csv(f\"{DIR}/users.csv\", dtype=dict(id=\"uint64\"))\n",
    "flights = cudf.read_csv(f\"{DIR}/flights.csv\")\n",
    "\n",
    "user_flights_dtypes = dict.fromkeys((\"user_id\", \"flight_id\"), \"uint64\")\n",
    "user_flights = cudf.read_csv(f\"{DIR}/user_flights.csv\", dtype=user_flights_dtypes)\n",
    "\n",
    "cards_dtypes = dict.fromkeys((\"id\", \"user_id\", \"number\"), \"uint64\")\n",
    "cards = cudf.read_csv(f\"{DIR}/cards.csv\", dtype=cards_dtypes)\n",
    "\n",
    "card_flights_dtypes = dict.fromkeys((\"card_id\", \"flight_id\"), \"uint64\")\n",
    "card_flights = cudf.read_csv(f\"{DIR}/card_flights.csv\", dtype=card_flights_dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bdcaa3d1-6414-425f-9a40-f65f74c004b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Validate the raw departure columns and derive the `dep_ts` timestamp column that\n",
    "# the per-user cells below sort on and take differences of.\n",
    "has_time = flights[\"has_time\"].astype(\"str\").str.strip().str.lower()\n",
    "has_time = (has_time == \"true\").fillna(False)  # a missing flag counts as \"no time\"\n",
    "\n",
    "date = flights[\"dep_date\"].fillna(\"\").astype(\"str\").str.strip()\n",
    "time = flights[\"dep_time\"].fillna(\"\").astype(\"str\").str.strip()\n",
    "\n",
    "# These patterns only check the digit layout, not calendar validity.\n",
    "mask_date = date.str.match(r\"^\\d{4}-\\d{2}-\\d{2}$\")\n",
    "mask_time = time.str.match(r\"^\\d{2}:\\d{2}(:\\d{2})?$\")\n",
    "\n",
    "mask_dt = mask_date & mask_time & has_time     # date plus a usable time of day\n",
    "mask_d = mask_date & (~has_time | ~mask_time)  # date only\n",
    "\n",
    "# Pad HH:MM to HH:MM:SS: appending \":00\" and keeping the first 8 characters leaves\n",
    "# a value that already carries seconds unchanged.\n",
    "time_full = (time + \":00\").str.slice(0, 8)\n",
    "\n",
    "# NOTE(review): date-only rows are placed at 00:00:00 of that day - confirm this is\n",
    "# the intended convention.\n",
    "ts_text = (date + \" \" + time_full).where(mask_dt, date + \" 00:00:00\")\n",
    "\n",
    "# Rows without a well-formed date become null here; the merge cell drops rows whose\n",
    "# `dep_ts` is null.\n",
    "ts_text = ts_text.where(mask_date)\n",
    "flights[\"dep_ts\"] = cudf.to_datetime(ts_text, format=\"%Y-%m-%d %H:%M:%S\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "302947fa-af35-4fad-91bc-6da2468d9708",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Haversine distance between the origin and destination coordinates of every flight,\n",
    "# computed on the GPU with cupy and stored as `flights[\"distance_km\"]`.\n",
    "for c in (\"fromlat\", \"fromlon\", \"tolat\", \"tolon\"):\n",
    "    flights[c] = flights[c].astype(\"float64\")\n",
    "\n",
    "R = cp.float64(6371.0088)        # sphere radius; presumably the mean Earth radius in km\n",
    "rad = cp.float64(cp.pi / 180.0)  # degrees -> radians factor\n",
    "\n",
    "lat1 = flights[\"fromlat\"].values * rad\n",
    "lon1 = flights[\"fromlon\"].values * rad\n",
    "lat2 = flights[\"tolat\"].values * rad\n",
    "lon2 = flights[\"tolon\"].values * rad\n",
    "\n",
    "dlat = lat2 - lat1\n",
    "dlon = lon2 - lon1\n",
    "a = cp.sin(dlat * 0.5)**2 + cp.cos(lat1) * cp.cos(lat2) * cp.sin(dlon * 0.5)**2\n",
    "# Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for nearly\n",
    "# antipodal points), which would turn sqrt/arcsin into NaN; clamp it first.\n",
    "a = cp.clip(a, 0.0, 1.0)\n",
    "\n",
    "# Build the column on the frame's own index so the assignment is positional even if\n",
    "# `flights` does not carry a default index.\n",
    "flights[\"distance_km\"] = cudf.Series(\n",
    "    R * (2.0 * cp.arcsin(cp.sqrt(a))), index=flights.index\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b141f399-4f52-40d0-a2f6-6434ab1ebb74",
   "metadata": {},
   "outputs": [],
   "source": [
    "pair_cols = [\"user_id\", \"flight_id\"]\n",
    "\n",
    "# Links recorded directly between a user and a flight.\n",
    "map_user_flights = user_flights[pair_cols]\n",
    "\n",
    "# Links recorded through a card: look up the card's user_id, then keep the pair.\n",
    "cards_min = cards[[\"id\", \"user_id\"]].rename(columns={\"id\": \"card_id\"})\n",
    "map_card_users = card_flights.merge(cards_min, on=\"card_id\", how=\"left\")\n",
    "map_card_users = map_card_users[pair_cols]\n",
    "\n",
    "# Union of both sources, without incomplete or repeated pairs.\n",
    "user_flight_map = (\n",
    "    cudf.concat([map_user_flights, map_card_users], ignore_index=True)\n",
    "    .dropna(subset=pair_cols)\n",
    "    .drop_duplicates()\n",
    ")\n",
    "\n",
    "# Attach the flight attributes, keep only rows with a departure timestamp and put\n",
    "# each user's flights in chronological order (flight_id breaks ties).\n",
    "uf = (\n",
    "    user_flight_map.merge(flights, left_on=\"flight_id\", right_on=\"id\", how=\"inner\")\n",
    "    .dropna(subset=[\"dep_ts\"])\n",
    "    .sort_values([\"user_id\", \"dep_ts\", \"flight_id\"])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0cb5b280-451b-4cf1-ad61-de990b490d63",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the column every per-user metric below groups `uf` by.\n",
    "uid = \"user_id\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "44c7edf4-276d-49c2-a93d-2b8a60bcfaab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per user: length of the longest run of flights (in the sorted order of `uf`) in\n",
    "# which each flight departs at most MAX_GAP after the previous one.\n",
    "prev_dep = uf.groupby(uid)[\"dep_ts\"].shift(1)\n",
    "gap = uf[\"dep_ts\"] - prev_dep  # null on each user's first flight; reused further down\n",
    "\n",
    "# The comparison is null wherever `gap` is null. Resolve that null explicitly so the\n",
    "# first flight of every user reliably opens a segment and is counted in it, instead\n",
    "# of depending on how `|` combines a boolean with a null.\n",
    "cond_lt6h = (gap <= MAX_GAP).fillna(False)\n",
    "is_new_seg = ~cond_lt6h\n",
    "\n",
    "# Running count of segment starts within a user = segment id of each flight.\n",
    "seg_id = is_new_seg.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
    "streak_sizes = uf.groupby([uid, seg_id]).size().rename(\"streak_len\")\n",
    "max_streak_lt6h = streak_sizes.groupby(level=0).max().rename(\"max_streak_lt6h\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6cbe6321-2033-413e-9dd4-305c339361fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per user: number of runs of at least 3 consecutive flights with\n",
    "# distance_km >= BIG_KM.\n",
    "# A flight whose distance is missing is treated as not big (explicit fill rather\n",
    "# than relying on how `|` combines a boolean with a null).\n",
    "big = (uf[\"distance_km\"] >= BIG_KM).fillna(False)\n",
    "big_new = ~big\n",
    "\n",
    "# Every non-big flight bumps the counter, so consecutive big flights share one id.\n",
    "big_seg = big_new.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
    "\n",
    "# Filter the key with the same mask as the frame so both have matching rows.\n",
    "big_sizes = uf[big].groupby([uid, big_seg[big]]).size().rename(\"big_run_len\")\n",
    "count_big_streaks_ge3 = (\n",
    "    (big_sizes >= 3).groupby(level=0).sum().astype(\"int64\").rename(\"count_big_streaks_ge3\")\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "6668873f-f36d-4423-a735-d744647db88e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per user: longest run of consecutive flights on the same from→to route.\n",
    "route = uf[\"from\"].astype(\"str\") + \"→\" + uf[\"to\"].astype(\"str\")\n",
    "route_prev = route.groupby(uf[uid]).shift(1)\n",
    "\n",
    "# Comparing against a missing previous route yields null. Count that as a change so\n",
    "# each user's first flight reliably opens a run and is included in its size, instead\n",
    "# of depending on how `|` combines a boolean with a null.\n",
    "route_change = (route != route_prev).fillna(True)\n",
    "\n",
    "route_seg = route_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
    "route_run_sizes = uf.groupby([uid, route_seg]).size().rename(\"route_run_len\")\n",
    "max_consec_same_route = route_run_sizes.groupby(level=0).max().rename(\"max_consec_same_route\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3d44e46-147b-433f-b985-5d6b3baefd6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per user: longest run of consecutive flights departing from the same origin.\n",
    "orig = uf[\"from\"].astype(\"str\")\n",
    "orig_prev = orig.groupby(uf[uid]).shift(1)\n",
    "\n",
    "# Comparing against a missing previous origin yields null. Count that as a change so\n",
    "# each user's first flight reliably opens a run and is included in its size, instead\n",
    "# of depending on how `|` combines a boolean with a null.\n",
    "orig_change = (orig != orig_prev).fillna(True)\n",
    "\n",
    "orig_seg = orig_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
    "orig_run_sizes = uf.groupby([uid, orig_seg]).size().rename(\"orig_run_len\")\n",
    "max_consec_same_origin = orig_run_sizes.groupby(level=0).max().rename(\"max_consec_same_origin\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52028ba2-bd19-44fc-a7eb-bf76be246b79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Largest single-flight distance per user.\n",
    "longest_distance_km = uf.groupby(uid)[\"distance_km\"].max().rename(\"longest_distance_km\")\n",
    "\n",
    "# Mean gap between consecutive departures per user, expressed in days.\n",
    "# `gap` comes from the streak cell above.\n",
    "one_day = np.timedelta64(1, \"D\")\n",
    "gap_days = (gap / one_day).astype(\"float64\")\n",
    "avg_days_between = gap_days.groupby(uf[uid]).mean().rename(\"avg_days_between_flights\")\n",
    "\n",
    "# Distinct airports per user, counting both ends of every flight.\n",
    "departures = uf[[uid, \"from\"]].rename(columns={\"from\": \"airport\"})\n",
    "arrivals = uf[[uid, \"to\"]].rename(columns={\"to\": \"airport\"})\n",
    "visited = cudf.concat([departures, arrivals], ignore_index=True)\n",
    "unique_airports = (\n",
    "    visited.groupby(uid)[\"airport\"].nunique().astype(\"int64\").rename(\"unique_airports\")\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "88e632af-278b-43af-a15d-7e9e9235afe6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count-style metrics: a missing value means \"nothing observed\" and is reported as 0.\n",
    "# `longest_distance_km` and `avg_days_between_flights` are left null when unknown.\n",
    "zero_fill = {\n",
    "    \"max_streak_lt6h\": 0,\n",
    "    \"count_big_streaks_ge3\": 0,\n",
    "    \"max_consec_same_route\": 0,\n",
    "    \"max_consec_same_origin\": 0,\n",
    "    \"unique_airports\": 0,\n",
    "}\n",
    "\n",
    "# One row per user that has at least one flight in `uf`, one column per metric.\n",
    "metrics = (\n",
    "    max_streak_lt6h.to_frame()\n",
    "    .join(count_big_streaks_ge3, how=\"left\")\n",
    "    .join(max_consec_same_route,  how=\"left\")\n",
    "    .join(max_consec_same_origin, how=\"left\")\n",
    "    .join(unique_airports,        how=\"left\")\n",
    "    .join(longest_distance_km,    how=\"left\")\n",
    "    .join(avg_days_between,       how=\"left\")\n",
    ").fillna(zero_fill)\n",
    "\n",
    "# Users without any flight only receive their (null) metric cells in this left join,\n",
    "# so the zero fill has to be applied again here to reach them.\n",
    "users_analytics = users.set_index(\"id\").join(metrics, how=\"left\").fillna(zero_fill)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "951fe3b3-dac6-4c3a-8e83-80a048f5864d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the per-user table next to the inputs; index=True keeps the frame's index\n",
    "# (the user id) as a column of the file.\n",
    "out_path = f\"{DIR}/users_analytics_gpu_cudf.csv\"\n",
    "users_analytics.to_csv(out_path, index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86817955-42e9-480e-8ffd-2a628816f6a9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}