From f60863aecfdfb2a7a35c9d2d4233142ca17c9152 Mon Sep 17 00:00:00 2001
From: leshe4ka46
Date: Tue, 9 Dec 2025 10:10:48 +0300
Subject: rapids ok

---
 .../25.11.25.md        | 33 +++++++++++++++++++++
 .../3-05_knn.md        | 19 ++++++++++++
 .../3-06_xgboost.ipynb |  7 +++++
 .../3-06_xgboost.md    |  4 +++
 .../3-06_xgboost.png   | Bin 0 -> 101671 bytes
 5 files changed, 63 insertions(+)
 create mode 100644 Fundamentals_of_Accelerated_Data_Science/25.11.25.md
 create mode 100644 Fundamentals_of_Accelerated_Data_Science/3-05_knn.md
 create mode 100644 Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.md
 create mode 100644 Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.png

diff --git a/Fundamentals_of_Accelerated_Data_Science/25.11.25.md b/Fundamentals_of_Accelerated_Data_Science/25.11.25.md
new file mode 100644
index 0000000..8de5afc
--- /dev/null
+++ b/Fundamentals_of_Accelerated_Data_Science/25.11.25.md
@@ -0,0 +1,33 @@
+3.04 how the coefficients are adjusted; the loss function
+
+3.06 entropy, Gini (splitting criteria)
+what subsample=0.8 helps with
+F score for a feature
+ROC curve
+
+https://mlu-explain.github.io/logistic-regression/
+A common way to estimate coefficients is to use gradient descent. In gradient descent, the goal is to minimize the Log-Loss cost function over all samples. This method involves selecting initial parameter values, and then updating them incrementally by moving them in the direction that decreases the loss. At each iteration, the parameter value is updated by the gradient, scaled by the step size (otherwise known as the learning rate). The gradient is the vector encompassing the direction and rate of the fastest increase of a function, which can be calculated using partial derivatives. The parameters are updated in the opposite direction of the gradient by the step size in an attempt to find the parameter values that minimize the Log-Loss.
+
+Because the gradient calculates where the function is increasing, going in the opposite direction leads us to the minimum of our function. In this manner, we can repeatedly update our model's coefficients such that we eventually reach the minimum of our error function and obtain a sigmoid curve that fits our data well. (A runnable sketch of this update loop is at the end of these notes.)
+
+https://www.stat.cmu.edu/~cshalizi/uADA/12/lectures/ch12.pdf
+
+https://medium.com/data-science/optimization-loss-function-under-the-hood-part-ii-d20a239cde11
+
+https://xgboost.readthedocs.io/en/stable/parameter.html
+subsample — subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees, and this will prevent overfitting. Subsampling will occur once in every boosting iteration.
+
+ROC (Receiver Operating Characteristic)
+https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc
+
+https://www.learndatasci.com/glossary/gini-impurity/
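+
+A minimal NumPy sketch of that gradient-descent loop for logistic regression (the toy data, learning rate, and iteration count are illustrative, not from the lesson):
+
+```python
+import numpy as np
+
+def sigmoid(z):
+    return 1.0 / (1.0 + np.exp(-z))
+
+def fit_logistic(X, y, lr=0.1, n_iter=1000):
+    """Minimize Log-Loss over all samples with batch gradient descent."""
+    w = np.zeros(X.shape[1])           # initial parameter values
+    b = 0.0
+    n = len(y)
+    for _ in range(n_iter):
+        p = sigmoid(X @ w + b)         # predicted probabilities
+        grad_w = X.T @ (p - y) / n     # gradient of Log-Loss w.r.t. w
+        grad_b = np.mean(p - y)        # ... and w.r.t. the intercept
+        w -= lr * grad_w               # step against the gradient,
+        b -= lr * grad_b               # scaled by the learning rate
+    return w, b
+
+X = np.array([[0.5], [1.0], [1.5], [3.0], [3.5], [4.0]])  # toy feature
+y = np.array([0, 0, 0, 1, 1, 1])                          # toy labels
+w, b = fit_logistic(X, y)
+```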
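+
+For the 3.06 splitting criteria, a sketch of how Gini impurity and entropy are computed from a node's class proportions (formulas as in the glossary linked above; the example distributions are made up):
+
+```python
+import numpy as np
+
+def gini(p):
+    """Gini impurity: 1 - sum(p_i^2)."""
+    p = np.asarray(p, dtype=float)
+    return 1.0 - np.sum(p ** 2)
+
+def entropy(p):
+    """Shannon entropy: -sum(p_i * log2(p_i)), with 0*log(0) = 0."""
+    p = np.asarray(p, dtype=float)
+    p = p[p > 0]
+    return -np.sum(p * np.log2(p))
+
+print(gini([0.5, 0.5]), entropy([0.5, 0.5]))  # 0.5 1.0  -- maximally impure
+print(gini([0.9, 0.1]), entropy([0.9, 0.1]))  # 0.18 ~0.47 -- much purer node
+```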
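+
+How subsample is passed in practice — a sketch using the scikit-learn-style XGBoost wrapper (the other hyperparameter values are arbitrary, not tuned; X_train / y_train are assumed to exist):
+
+```python
+from xgboost import XGBClassifier
+
+# subsample=0.8: each boosting round grows its trees on a random 80%
+# of the rows, which regularizes the model against overfitting
+model = XGBClassifier(
+    n_estimators=100,
+    max_depth=6,
+    learning_rate=0.1,
+    subsample=0.8,
+)
+# model.fit(X_train, y_train)
+```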
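+
+And a scikit-learn sketch for the ROC curve: sweep the decision threshold over the model's scores and plot TPR against FPR (the labels and scores below are made up):
+
+```python
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.metrics import roc_curve, roc_auc_score
+
+y_true = np.array([0, 0, 1, 1, 0, 1])
+y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])  # predicted P(class 1)
+
+fpr, tpr, thresholds = roc_curve(y_true, y_score)
+print("AUC:", roc_auc_score(y_true, y_score))
+
+plt.plot(fpr, tpr)
+plt.xlabel("False positive rate")
+plt.ylabel("True positive rate")
+plt.show()
+```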
diff --git a/Fundamentals_of_Accelerated_Data_Science/3-05_knn.md b/Fundamentals_of_Accelerated_Data_Science/3-05_knn.md
new file mode 100644
index 0000000..5fdf591
--- /dev/null
+++ b/Fundamentals_of_Accelerated_Data_Science/3-05_knn.md
@@ -0,0 +1,19 @@
+lazy learner
+trees for nearest-neighbor search (KD, Quad, R, Ball)
+
+- k-dimensional (KD) tree
+Recursively splits the data along coordinate axes:
+at each node pick a dimension and split at the median, so each node corresponds to a hyperrectangle in d-dimensional space.
+
+- Quad-tree
+Each node is a square region.
+If too many points fall into a region, subdivide it into four equal quadrants.
+
+- R-tree (rectangle tree)
+Each node stores several children, each with a Minimum Bounding Rectangle covering many points or objects. Subtrees group spatially close objects. Insertions try to minimize rectangle enlargement.
+
+- Ball tree
+Each node contains:
+  - a center point
+  - a radius enclosing all points in that node
+Two children represent two balls that split the data.
diff --git a/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.ipynb b/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.ipynb
index d3548f2..604e881 100644
--- a/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.ipynb
+++ b/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.ipynb
@@ -742,6 +742,13 @@
     "plt.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "https://ru.wikipedia.org/wiki/%D0%94%D0%B2%D0%BE%D0%B8%D1%87%D0%BD%D0%B0%D1%8F_%D0%BA%D0%BB%D0%B0%D1%81%D1%81%D0%B8%D1%84%D0%B8%D0%BA%D0%B0%D1%86%D0%B8%D1%8F#/media/%D0%A4%D0%B0%D0%B9%D0%BB:Binary-classification-labeled.svg"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
diff --git a/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.md b/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.md
new file mode 100644
index 0000000..d5694e3
--- /dev/null
+++ b/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.md
@@ -0,0 +1,4 @@
+https://neerc.ifmo.ru/wiki/index.php?title=XGBoost
+
+
+entropy, Gini
\ No newline at end of file
diff --git a/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.png b/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.png
new file mode 100644
index 0000000..fdbebde
Binary files /dev/null and b/Fundamentals_of_Accelerated_Data_Science/3-06_xgboost.png differ
--
cgit v1.2.3