1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
|
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "aa17ff76-7c65-44e2-b36e-23ad9ffe4c9c",
"metadata": {},
"outputs": [],
"source": [
"import cudf\n",
"import cupy as cp\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "170ea4a3-1b4d-41c3-9e17-fd8943dc6b31",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'status': 'ok', 'restart': True}"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import IPython\n",
"\n",
"# Utility cell: restarts the Jupyter kernel on demand.\n",
"# A restart wipes every import and variable, so the call is opt-in; leaving it\n",
"# unconditional would break a top-to-bottom run right after the imports cell.\n",
"RESTART_KERNEL = False\n",
"\n",
"if RESTART_KERNEL:\n",
"    app = IPython.Application.instance()\n",
"    app.kernel.do_shutdown(True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5a0b20f-ff99-401d-804c-568f0bfb2f11",
"metadata": {},
"outputs": [],
"source": [
"# Directory that holds the input CSV files and receives the output CSV.\n",
"DIR = \"./data\"\n",
"\n",
"# Distance threshold (km) at or above which a flight counts as big.\n",
"BIG_KM = 2000.0\n",
"\n",
"# Largest gap between two consecutive departures that still continues a streak.\n",
"MAX_GAP_HOURS = 6\n",
"MAX_GAP = np.timedelta64(MAX_GAP_HOURS, \"h\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fc1282ce-b717-453e-bc64-9fbb758a2fdc",
"metadata": {},
"outputs": [],
"source": [
"# Identifier columns are read as uint64 everywhere so that join keys share one dtype.\n",
"users = cudf.read_csv(f\"{DIR}/users.csv\", dtype={\"id\": \"uint64\"})\n",
"\n",
"# `id` is pinned here as well: it is merged against the uint64 `flight_id` columns,\n",
"# and a mixed int64/uint64 key would otherwise need an implicit cast at join time.\n",
"flights = cudf.read_csv(f\"{DIR}/flights.csv\", dtype={\"id\": \"uint64\"})\n",
"\n",
"user_flights = cudf.read_csv(\n",
"    f\"{DIR}/user_flights.csv\", dtype={\"user_id\": \"uint64\", \"flight_id\": \"uint64\"}\n",
")\n",
"cards = cudf.read_csv(\n",
"    f\"{DIR}/cards.csv\", dtype={\"id\": \"uint64\", \"user_id\": \"uint64\", \"number\": \"uint64\"}\n",
")\n",
"card_flights = cudf.read_csv(\n",
"    f\"{DIR}/card_flights.csv\", dtype={\"card_id\": \"uint64\", \"flight_id\": \"uint64\"}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdcaa3d1-6414-425f-9a40-f65f74c004b6",
"metadata": {},
"outputs": [],
"source": [
"# Normalise the `has_time` flag to a null-free boolean mask.\n",
"has_time = flights[\"has_time\"].astype(\"str\").str.strip().str.lower()\n",
"has_time = (has_time == \"true\").fillna(False)\n",
"\n",
"date = flights[\"dep_date\"].fillna(\"\").astype(\"str\").str.strip()\n",
"time = flights[\"dep_time\"].fillna(\"\").astype(\"str\").str.strip()\n",
"\n",
"mask_date = date.str.match(r\"^\\d{4}-\\d{2}-\\d{2}$\")\n",
"mask_time = time.str.match(r\"^\\d{2}:\\d{2}(:\\d{2})?$\")\n",
"\n",
"# Rows with a usable date and time vs. rows where only the date is usable.\n",
"mask_dt = mask_date & mask_time & has_time\n",
"mask_d = mask_date & (~has_time | ~mask_time)\n",
"\n",
"# Build the `dep_ts` column that the later sort / gap cells read from `flights`.\n",
"# NOTE(review): date-only rows are placed at midnight - confirm this is the intended default.\n",
"time_full = time.where(time.str.len() != 5, time + \":00\")  # HH:MM -> HH:MM:SS\n",
"stamp = (date + \" \" + time_full).where(mask_dt, date + \" 00:00:00\")\n",
"stamp = stamp.where(mask_dt | mask_d, None)  # rows without a usable date become null\n",
"flights[\"dep_ts\"] = cudf.to_datetime(stamp, format=\"%Y-%m-%d %H:%M:%S\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "302947fa-af35-4fad-91bc-6da2468d9708",
"metadata": {},
"outputs": [],
"source": [
"# Great-circle (haversine) distance of every flight, stored as `distance_km`.\n",
"for c in (\"fromlat\", \"fromlon\", \"tolat\", \"tolon\"):\n",
"    flights[c] = flights[c].astype(\"float64\")\n",
"\n",
"R = cp.float64(6371.0088)  # sphere radius; the result is stored as kilometres\n",
"rad = cp.float64(cp.pi / 180.0)  # degrees -> radians factor\n",
"\n",
"lat1 = flights[\"fromlat\"].values * rad\n",
"lon1 = flights[\"fromlon\"].values * rad\n",
"lat2 = flights[\"tolat\"].values * rad\n",
"lon2 = flights[\"tolon\"].values * rad\n",
"\n",
"dlat = lat2 - lat1\n",
"dlon = lon2 - lon1\n",
"a = cp.sin(dlat * 0.5)**2 + cp.cos(lat1) * cp.cos(lat2) * cp.sin(dlon * 0.5)**2\n",
"# Rounding can push `a` slightly above 1 for near-antipodal points, which would make\n",
"# arcsin return NaN; cp.minimum clamps it while still propagating genuine NaNs.\n",
"a = cp.minimum(a, 1.0)\n",
"# Reuse the frame's own index so the assignment can never be re-aligned.\n",
"flights[\"distance_km\"] = cudf.Series(\n",
"    R * (2.0 * cp.arcsin(cp.sqrt(a))), index=flights.index\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b141f399-4f52-40d0-a2f6-6434ab1ebb74",
"metadata": {},
"outputs": [],
"source": [
"# Flights linked to a user directly.\n",
"map_user_flights = user_flights[[\"user_id\", \"flight_id\"]]\n",
"\n",
"# Flights linked through a card: resolve every card to the user that owns it.\n",
"cards_min = cards[[\"id\", \"user_id\"]].rename(columns={\"id\": \"card_id\"})\n",
"card_links = card_flights.merge(cards_min, on=\"card_id\", how=\"left\")\n",
"map_card_users = card_links[[\"user_id\", \"flight_id\"]]\n",
"\n",
"# Union of both link sources, without incomplete or repeated pairs.\n",
"user_flight_map = (\n",
"    cudf.concat([map_user_flights, map_card_users], ignore_index=True)\n",
"    .dropna(subset=[\"user_id\", \"flight_id\"])\n",
"    .drop_duplicates()\n",
")\n",
"\n",
"# Attach the flight attributes, keep only flights with a departure timestamp,\n",
"# and order each user's flights chronologically (flight_id breaks ties).\n",
"uf = (\n",
"    user_flight_map.merge(flights, left_on=\"flight_id\", right_on=\"id\", how=\"inner\")\n",
"    .dropna(subset=[\"dep_ts\"])\n",
"    .sort_values([\"user_id\", \"dep_ts\", \"flight_id\"])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "0cb5b280-451b-4cf1-ad61-de990b490d63",
"metadata": {},
"outputs": [],
"source": [
"# Name of the column used as the per-user grouping key in the cells below.\n",
"uid = \"user_id\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "44c7edf4-276d-49c2-a93d-2b8a60bcfaab",
"metadata": {},
"outputs": [],
"source": [
"# Gap between consecutive departures of the same user (null for a user's first flight).\n",
"# Relies on `uf` being sorted by user and departure time.\n",
"prev_dep = uf.groupby(uid)[\"dep_ts\"].shift(1)\n",
"gap = uf[\"dep_ts\"] - prev_dep\n",
"\n",
"# Resolve the null gap of a first flight *before* negating, so the segment flag never\n",
"# depends on how `|` and `~` treat nulls (a null flag would yield a null segment id\n",
"# and silently drop that flight from the groupby below).\n",
"cond_lt6h = (gap <= MAX_GAP).fillna(False)\n",
"is_new_seg = ~cond_lt6h\n",
"\n",
"# Running count of segment starts per user = segment id of each flight.\n",
"seg_id = is_new_seg.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
"streak_sizes = uf.groupby([uid, seg_id]).size().rename(\"streak_len\")\n",
"max_streak_lt6h = streak_sizes.groupby(level=0).max().rename(\"max_streak_lt6h\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "6cbe6321-2033-413e-9dd4-305c339361fc",
"metadata": {},
"outputs": [],
"source": [
"# A flight is big when its distance reaches BIG_KM; an unknown distance counts as not big.\n",
"big = (uf[\"distance_km\"] >= BIG_KM).fillna(False)\n",
"\n",
"# Every non-big flight bumps the per-user counter, so consecutive big flights share one id.\n",
"big_new = ~big\n",
"big_seg = big_new.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
"\n",
"# Filter the segment ids with the same mask as the rows so both group keys line up exactly.\n",
"big_sizes = uf[big].groupby([uid, big_seg[big]]).size().rename(\"big_run_len\")\n",
"\n",
"# Number of runs per user that contain at least three big flights.\n",
"count_big_streaks_ge3 = (big_sizes >= 3).groupby(level=0).sum().astype(\"int64\").rename(\"count_big_streaks_ge3\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "6668873f-f36d-4423-a735-d744647db88e",
"metadata": {},
"outputs": [],
"source": [
"# Route label of every flight and of the same user's previous flight.\n",
"route = uf[\"from\"].astype(\"str\") + \"→\" + uf[\"to\"].astype(\"str\")\n",
"route_prev = route.groupby(uf[uid]).shift(1)\n",
"\n",
"# A null comparison (first flight of a user, or a missing route) starts a new run;\n",
"# fillna(True) makes that explicit instead of relying on null handling in `|`.\n",
"route_change = (route != route_prev).fillna(True)\n",
"\n",
"route_seg = route_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
"route_run_sizes = uf.groupby([uid, route_seg]).size().rename(\"route_run_len\")\n",
"max_consec_same_route = route_run_sizes.groupby(level=0).max().rename(\"max_consec_same_route\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3d44e46-147b-433f-b985-5d6b3baefd6e",
"metadata": {},
"outputs": [],
"source": [
"# Origin of every flight and of the same user's previous flight.\n",
"orig = uf[\"from\"].astype(\"str\")\n",
"orig_prev = orig.groupby(uf[uid]).shift(1)\n",
"\n",
"# A null comparison (first flight of a user, or a missing origin) starts a new run;\n",
"# fillna(True) makes that explicit instead of relying on null handling in `|`.\n",
"orig_change = (orig != orig_prev).fillna(True)\n",
"\n",
"orig_seg = orig_change.astype(\"int32\").groupby(uf[uid]).cumsum()\n",
"orig_run_sizes = uf.groupby([uid, orig_seg]).size().rename(\"orig_run_len\")\n",
"max_consec_same_origin = orig_run_sizes.groupby(level=0).max().rename(\"max_consec_same_origin\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52028ba2-bd19-44fc-a7eb-bf76be246b79",
"metadata": {},
"outputs": [],
"source": [
"# Longest single flight of each user.\n",
"longest_distance_km = (\n",
"    uf.groupby(uid)[\"distance_km\"].max().rename(\"longest_distance_km\")\n",
")\n",
"\n",
"# Mean gap between consecutive departures of a user, expressed in days.\n",
"one_day = np.timedelta64(1, \"D\")\n",
"gap_days = (gap / one_day).astype(\"float64\")\n",
"avg_days_between = (\n",
"    gap_days.groupby(uf[uid]).mean().rename(\"avg_days_between_flights\")\n",
")\n",
"\n",
"# Distinct airports per user, counting both origins and destinations.\n",
"origins = uf[[uid, \"from\"]].rename(columns={\"from\": \"airport\"})\n",
"destinations = uf[[uid, \"to\"]].rename(columns={\"to\": \"airport\"})\n",
"airports_long = cudf.concat([origins, destinations], ignore_index=True)\n",
"unique_airports = (\n",
"    airports_long.groupby(uid)[\"airport\"].nunique().astype(\"int64\").rename(\"unique_airports\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "88e632af-278b-43af-a15d-7e9e9235afe6",
"metadata": {},
"outputs": [],
"source": [
"# Count-like metrics default to 0 when a user has no matching rows.\n",
"zero_fill = {\n",
"    \"max_streak_lt6h\": 0,\n",
"    \"count_big_streaks_ge3\": 0,\n",
"    \"max_consec_same_route\": 0,\n",
"    \"max_consec_same_origin\": 0,\n",
"    \"unique_airports\": 0,\n",
"}\n",
"\n",
"# One row per user that appears in `max_streak_lt6h`; the other metrics are left-joined onto it.\n",
"metrics = (\n",
"    max_streak_lt6h.to_frame()\n",
"    .join(count_big_streaks_ge3, how=\"left\")\n",
"    .join(max_consec_same_route, how=\"left\")\n",
"    .join(max_consec_same_origin, how=\"left\")\n",
"    .join(unique_airports, how=\"left\")\n",
"    .join(longest_distance_km, how=\"left\")\n",
"    .join(avg_days_between, how=\"left\")\n",
").fillna(zero_fill)\n",
"\n",
"# Fill again after the left join: users that are absent from `metrics` come back with\n",
"# nulls in every metric column, and their counts should read 0 as well.\n",
"users_analytics = users.set_index(\"id\").join(metrics, how=\"left\").fillna(zero_fill)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "951fe3b3-dac6-4c3a-8e83-80a048f5864d",
"metadata": {},
"outputs": [],
"source": [
"# Write the per-user table to CSV, keeping the index as a column.\n",
"out_path = f\"{DIR}/users_analytics_gpu_cudf.csv\"\n",
"users_analytics.to_csv(out_path, index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86817955-42e9-480e-8ffd-2a628816f6a9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|