diff --git "a/scripts/02_Process_files.ipynb" "b/scripts/02_Process_files.ipynb" new file mode 100644--- /dev/null +++ "b/scripts/02_Process_files.ipynb" @@ -0,0 +1,2688 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state":{}, + "a0c6705f4fad4f519897e05a5de06a94": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d449a38929414a4cb4bebb2e01fbaf44", + "IPY_MODEL_eebed51a9c8b41939c686c261acc7e8a", + "IPY_MODEL_0f3a830648d8410894aa3dc47227992f" + ], + "layout": "IPY_MODEL_07a1742aee284bf5834ca77a5f03d3ee" + } + }, + "d449a38929414a4cb4bebb2e01fbaf44": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_789e5125b32f4a8390cff308983b878c", + "placeholder": "​", + "style": "IPY_MODEL_88eebcdcb5eb45bfbe040c8ccbf87411", + "value": "Crawler.zip: 100%" + } + }, + "eebed51a9c8b41939c686c261acc7e8a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": 
[], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_922f135bb50f4144ab0d6abad0df9e69", + "max": 439926114, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_68b6bed833c74ccea239002b23c404a4", + "value": 439926114 + } + }, + "0f3a830648d8410894aa3dc47227992f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3ab181bde71d481fa20d3bd5c5d30803", + "placeholder": "​", + "style": "IPY_MODEL_5768e9e182a04c08b4e8887bc4bda5f1", + "value": " 440M/440M [00:04<00:00, 91.5MB/s]" + } + }, + "07a1742aee284bf5834ca77a5f03d3ee": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "789e5125b32f4a8390cff308983b878c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "88eebcdcb5eb45bfbe040c8ccbf87411": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "922f135bb50f4144ab0d6abad0df9e69": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "68b6bed833c74ccea239002b23c404a4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3ab181bde71d481fa20d3bd5c5d30803": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5768e9e182a04c08b4e8887bc4bda5f1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "268c8a22fb2a4d2eab9c3e25de984221": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2d2b4736f11c433585b752139dd70dc9", + "IPY_MODEL_80d59424f44a4b199008e0d16343f4b0", + "IPY_MODEL_ed3db2b25f0e45a7a9cdf4405b1a87a1" + ], + "layout": "IPY_MODEL_df1e6015d489417485e3a28bbbd54728" + } + }, + "2d2b4736f11c433585b752139dd70dc9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f2c35ce7b4264cfa80ace120c2a8c2f9", + "placeholder": "​", + "style": "IPY_MODEL_25469448f7d6428ca97a96f9c9f156b7", + "value": "microsoft-learn.jsonl: 100%" + } + }, + "80d59424f44a4b199008e0d16343f4b0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6586ec5c75454671b85bd4659fd9a40e", + "max": 270257457, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_11273ae865b44f2b8d3a4363b3fa1178", + "value": 270257457 + } + }, + "ed3db2b25f0e45a7a9cdf4405b1a87a1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e36a5f615ae948148aa1222b0ddc514b", + "placeholder": "​", + "style": "IPY_MODEL_e64d5d3198634727a6a9f3634fb4afb8", + "value": " 270M/270M [00:07<00:00, 40.9MB/s]" + } + }, + "df1e6015d489417485e3a28bbbd54728": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f2c35ce7b4264cfa80ace120c2a8c2f9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "25469448f7d6428ca97a96f9c9f156b7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6586ec5c75454671b85bd4659fd9a40e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "11273ae865b44f2b8d3a4363b3fa1178": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e36a5f615ae948148aa1222b0ddc514b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": 
null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e64d5d3198634727a6a9f3634fb4afb8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "079339c01ce24e8ab8a61d24b36ee4e8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_888a7e6f633746be94943016dc9f7c56", + "IPY_MODEL_b41df158b85c4222ba2d65279337105d", + "IPY_MODEL_3bc4979197e94257a83d7f0deeafa43a" + ], + "layout": "IPY_MODEL_2ec303b4d09d41ed83102edec73026f0" + } + }, + "888a7e6f633746be94943016dc9f7c56": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_6b6b1bdbd8f64ad4b458bede86f7f3c6", + "placeholder": "​", + "style": "IPY_MODEL_11f49e72f98c4d6aa69247d0e1661a2b", + "value": "tech-community.jsonl: 100%" + } + }, + "b41df158b85c4222ba2d65279337105d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bf34ba8bf7bc4dbcb7152ecb22229c78", + "max": 2136201679, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_06f2266f798544adb127055305f1671f", + "value": 2136201679 + } + }, + "3bc4979197e94257a83d7f0deeafa43a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_328ccb5aaad64381acf906d3f2f39434", + "placeholder": "​", + "style": "IPY_MODEL_1a556d4a74b44803a88eb88b01f78abf", + "value": " 2.14G/2.14G [00:58<00:00, 52.8MB/s]" + } + }, + "2ec303b4d09d41ed83102edec73026f0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b6b1bdbd8f64ad4b458bede86f7f3c6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, 
+ "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "11f49e72f98c4d6aa69247d0e1661a2b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bf34ba8bf7bc4dbcb7152ecb22229c78": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "06f2266f798544adb127055305f1671f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "328ccb5aaad64381acf906d3f2f39434": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1a556d4a74b44803a88eb88b01f78abf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "e954c157384742568aeb1783495a652a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6265a7718d4d472f895fa8a55de6facf", + "IPY_MODEL_6d550f52e424492f8928886802018388", + "IPY_MODEL_ae40a367b9af4d9f8fdd96cbdad2cf27" + ], + "layout": "IPY_MODEL_e927f83749c6446d8734bb2de88e429b" + } + }, + "6265a7718d4d472f895fa8a55de6facf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cebf15f03d694f7ca7a38febf76cb054", + "placeholder": "​", + "style": "IPY_MODEL_cc5663d199724c83b893e3ce965d466d", + "value": "azure-updates.jsonl: 100%" + } + }, + "6d550f52e424492f8928886802018388": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f01c1e76140e4797bc79e0880db73304", + "max": 18574751, + 
"min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_34ff77d08c3d43b98d2d3855543a89dd", + "value": 18574751 + } + }, + "ae40a367b9af4d9f8fdd96cbdad2cf27": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f08ea4973a31409faf9c07751b5d5dd3", + "placeholder": "​", + "style": "IPY_MODEL_790fb4ae984446d9bdeec2a2876807ff", + "value": " 18.6M/18.6M [00:00<00:00, 76.6MB/s]" + } + }, + "e927f83749c6446d8734bb2de88e429b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": 
null, + "visibility": null, + "width": null + } + }, + "cebf15f03d694f7ca7a38febf76cb054": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cc5663d199724c83b893e3ce965d466d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f01c1e76140e4797bc79e0880db73304": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": 
"1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34ff77d08c3d43b98d2d3855543a89dd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f08ea4973a31409faf9c07751b5d5dd3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": 
null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "790fb4ae984446d9bdeec2a2876807ff": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "185ded41057340a08a5bccc551798f1a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5d1143e67464431aa5a96d4c01ae9b73", + "IPY_MODEL_d29703f8bf6d49188cb5a2bd3a5629eb", + "IPY_MODEL_bdc511cd9dcf44f09af118fa11b9c5c3" + ], + "layout": "IPY_MODEL_ed8fadcdcde4412b98c68e8d1924eaf0" + } + }, + "5d1143e67464431aa5a96d4c01ae9b73": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9e3eca47a25f43e69bcf0c9fd3962cae", + "placeholder": "​", + "style": "IPY_MODEL_bac81645f79848c8a7fc30e21ba6fcce", + "value": "github-samples.jsonl: 100%" + } + }, + "d29703f8bf6d49188cb5a2bd3a5629eb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2f03e86202bd4fc9b2d13e67ee0c7447", + "max": 171553392, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eb2f160c34814ff1b7aa8a1ad3d42edb", + "value": 171553392 + } + }, + "bdc511cd9dcf44f09af118fa11b9c5c3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3c08de811fbb422eb70ea751acb2e13d", + "placeholder": "​", + "style": "IPY_MODEL_1626f1da378045399f650e6fb1ba0429", + "value": " 172M/172M [00:04<00:00, 39.9MB/s]" + } + }, + 
"ed8fadcdcde4412b98c68e8d1924eaf0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9e3eca47a25f43e69bcf0c9fd3962cae": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": 
null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bac81645f79848c8a7fc30e21ba6fcce": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2f03e86202bd4fc9b2d13e67ee0c7447": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eb2f160c34814ff1b7aa8a1ad3d42edb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3c08de811fbb422eb70ea751acb2e13d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1626f1da378045399f650e6fb1ba0429": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2aef948c3e248bca2a6b4a84d8534b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e8e7a3c6cac44588a44ec5f4bd3c2771", + "IPY_MODEL_8cd41c5175c546b0a576d7f1f07b31f0", + "IPY_MODEL_fdd2a5d14b5045e6b3ff9f585c12c1b1" + ], + "layout": "IPY_MODEL_c74f16c144274bfc81f5484c9d5abf84" + } + }, + "e8e7a3c6cac44588a44ec5f4bd3c2771": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6663b46404c447ec82eb7b755f46e3f4", + "placeholder": "​", + "style": "IPY_MODEL_dcdc3d1d68f14c34931e1d257590ca70", + "value": "azure-architecture.jsonl: 100%" + } + }, + "8cd41c5175c546b0a576d7f1f07b31f0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6567ebb2bce04053812751d9bb0db2da", + "max": 15568877, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bbc373b2b34a447c85bc4983b4d62390", + "value": 15568877 + } + }, + "fdd2a5d14b5045e6b3ff9f585c12c1b1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_93590d191413459e878207a0176393be", + "placeholder": "​", + "style": "IPY_MODEL_0a674086b9244ba6ac5400b10c3b7a69", + "value": " 15.6M/15.6M [00:00<00:00, 70.2MB/s]" + } + }, + "c74f16c144274bfc81f5484c9d5abf84": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6663b46404c447ec82eb7b755f46e3f4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dcdc3d1d68f14c34931e1d257590ca70": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", 
+ "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6567ebb2bce04053812751d9bb0db2da": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bbc373b2b34a447c85bc4983b4d62390": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "93590d191413459e878207a0176393be": { + "model_module": "@jupyter-widgets/base", + "model_name": 
"LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0a674086b9244ba6ac5400b10c3b7a69": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9QUghTFp3gSE", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "0ffbdf26-4600-408e-ceb4-9c7191d71066" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m0.0/454.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m450.6/454.8 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.8/454.8 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m37.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m70.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m266.8/266.8 kB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m304.2/304.2 kB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.9/50.9 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.3/129.3 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "!pip install -q llama-index==0.12.12 openai==1.59.6 tiktoken==0.8.0" + ] + }, + { + "cell_type": "code", + "source": [ + "# set variables\n", + "from google.colab import userdata\n", + "\n", + "HF_TOKEN = userdata.get('HF_TOKEN2')" + ], + "metadata": { + "id": "hxGaRPQc3vrZ" + }, + "execution_count": null, + "outputs": [] + }, + { + 
"cell_type": "code", + "source": [ + "# Initialise HG\n", + "\n", + "from huggingface_hub import HfApi\n", + "\n", + "api = HfApi(token=HF_TOKEN)" + ], + "metadata": { + "id": "87eDAI2w34d4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + " # Download files\n", + "file_name = 'Crawler.zip'\n", + "api.hf_hub_download(\n", + " filename=file_name,\n", + " local_dir=\"./data\",\n", + " repo_id=\"vicpada/AzureResources\",\n", + " repo_type=\"dataset\"\n", + ")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 67, + "referenced_widgets": [ + "a0c6705f4fad4f519897e05a5de06a94", + "d449a38929414a4cb4bebb2e01fbaf44", + "eebed51a9c8b41939c686c261acc7e8a", + "0f3a830648d8410894aa3dc47227992f", + "07a1742aee284bf5834ca77a5f03d3ee", + "789e5125b32f4a8390cff308983b878c", + "88eebcdcb5eb45bfbe040c8ccbf87411", + "922f135bb50f4144ab0d6abad0df9e69", + "68b6bed833c74ccea239002b23c404a4", + "3ab181bde71d481fa20d3bd5c5d30803", + "5768e9e182a04c08b4e8887bc4bda5f1" + ] + }, + "id": "N45xj1ic4ENn", + "outputId": "627e0d3c-38a3-4290-d4d0-e66dbaf72f6e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Crawler.zip: 0%| | 0.00/440M [00:00 int:\n", + " encoding = tiktoken.get_encoding(encoding_name)\n", + " num_tokens = len(encoding.encode(string, disallowed_special=()))\n", + " return num_tokens\n", + "\n", + "def remove_copyright_header(content: str) -> str:\n", + " header_pattern = re.compile(r\"\\s*\", re.DOTALL)\n", + " cleaned_content = header_pattern.sub(\"\", content, count=1)\n", + " return cleaned_content.strip()\n", + "\n", + "def remove_url_and_title_header(content: str) -> str:\n", + " header_pattern = re.compile(r\"(?s)^---\\s*(?=.*\\b(url|title):).*?---\\s*\\n*\", re.DOTALL)\n", + " cleaned_content = header_pattern.sub(\"\", content, count=1)\n", + " return cleaned_content.strip()\n", + "\n", + "\n", + "\n", + "def 
process_files(folder_name:str, folder_path:str, files:List) -> List[dict[str, str]]:\n", + " jsonl_data = []\n", + " for file_name in files:\n", + " file_path = os.path.join(folder_path, file_name)\n", + " with open(file_path, 'r', encoding='utf-8') as infile:\n", + " content = infile.read()\n", + "\n", + " # Create a Json object\n", + " title = extract_title(content)\n", + " token_count = num_tokens_from_string(content, \"cl100k_base\")\n", + "\n", + " # Extract URL and Title, handling potential None results from re.search\n", + " url_match = re.search(r'^url:\\s*\"([^\"]+)\"', content, re.MULTILINE)\n", + " extracted_url = url_match.group(1) if url_match else None\n", + "\n", + " title_match = re.search(r'^title:\\s*\"([^\"]+)\"', content, re.MULTILINE)\n", + " extracted_title = title_match.group(1) if title_match else extract_title(content) # Use extract_title function as fallback\n", + "\n", + " # Skip very small or extremely large files\n", + " if token_count < 100 or token_count > 200_000:\n", + " print(\n", + " f\"Skipping {file_path} due to token count {token_count}\"\n", + " )\n", + " continue\n", + "\n", + " cleaned_content = remove_copyright_header(content)\n", + " cleaned_content = remove_url_and_title_header(cleaned_content)\n", + "\n", + " json_object = {\n", + " \"tokens\": token_count,\n", + " \"doc_id\" :str(uuid.uuid5(uuid.NAMESPACE_DNS, cleaned_content)),\n", + " \"name\": (extracted_title if extracted_title else file_name),\n", + " \"url\": extracted_url,\n", + " \"retrieve_doc\": (token_count <= 8000),\n", + " \"source\": folder_name,\n", + " \"content\": cleaned_content,\n", + " }\n", + " jsonl_data.append(json_object)\n", + " return jsonl_data\n", + "\n", + "\n", + "\n", + "# Iterate through each folder in the extracted directory\n", + "for folder_name in os.listdir(extract_dir):\n", + " folder_path = os.path.join(extract_dir, folder_name)\n", + "\n", + " # Check if it's a directory\n", + " if os.path.isdir(folder_path):\n", + " jsonl_filename = 
f\"{folder_name}.jsonl\"\n", + " jsonl_filepath = os.path.join(output_dir, jsonl_filename)\n", + "\n", + " with open(jsonl_filepath, 'w') as outfile:\n", + " # Iterate through each file in the current folder\n", + " json_data = process_files(folder_name, folder_path, os.listdir(folder_path))\n", + " for json_object in json_data:\n", + " json_str = json.dumps(json_object)\n", + " outfile.write(json_str + '\\n')\n", + "\n", + " print(f\"Created JSONL file: {jsonl_filepath}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Z2Mn64dN5It6", + "outputId": "244b2d68-97e8-4fba-aae1-3a90b31b86aa" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2658_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22_t5_s_gxcuf89792_rss_Community_%22.md due to token count 61\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_store_b_why-microsoft-store_icid=footer_why-msft-store_7102020_%22.md due to token count 79\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___portal.office.com_landing_%22.md due to token count 57\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_store_b_education_%22.md due to token count 61\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___sharegate.com_microsoft-migration_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22http___bit.ly_SVS17CHI_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.avepoint.com_events_webinar_office-365-delegating-administration_%22.md due to token count 
69\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___jmp.sh_s_jQ7HKR3DamYmG0AuwYLM_%22.md due to token count 70\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___openssl-library.org_news_secadv_20250211.txt_%22.md due to token count 63\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___choice.microsoft.com_%22.md due to token count 55\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.facebook.com_share.php_u=page.url&t=page-name_%22.md due to token count 67\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___account.microsoft.com_orders_%22.md due to token count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https__go.microsoft.com_fwlink_linkid=2196227%5C%22.md due to token count 66\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https__www.linkedin.com_sharing_share-offsite_url=%7Bpage.url%7D%5C%22.md due to token count 72\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___learn.microsoft.com_en-in_fasttrack_data-migration_%22.md due to token count 62\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_d_surface-laptop-go-2_8PGLPV76MJHN_%22.md due to token count 73\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https__go.microsoft.com_fwlink_p_LinkID=824764&clcid=0x409%5C%22.md due to token count 73\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2656_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22javascript_void(0)_%22.md due to token count 25\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___cmd.ms__%22.md due to token count 
55\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https__go.microsoft.com_fwlink_linkid=2139749%5C%22.md due to token count 66\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___docs.microsoft.com_en-us_powershell_module_microsoft.graph.identity.directorymanagement_update-mgorganizationbranding_view=graph-powershell-beta_%22.md due to token count 82\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_store_b_business-consultation_tab=educationconsultation&icid=CNavfooter_educationconsultation_%22.md due to token count 80\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2662_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___appsource.microsoft.com_en-us__%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_sitemap1.aspx_%22.md due to token count 61\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2655_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22http___aka.ms_microsoftpurview_%22.md due to token count 58\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_diversity__%22.md due to token count 61\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_microsoft-365_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_d_surface-laptop-5_8XN49V61S1BN_%22.md due to token count 74\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___support.microsoft.com__%22.md due to token 
count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.reddit.com_submit_url=page.url&title=page-name_%22.md due to token count 67\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___learn.microsoft.com_en-us_office_dev_add-ins_develop_automatically-open-a-task-pane-with-a-document_%22.md due to token count 74\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2661_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_security_%22.md due to token count 58\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___learn.microsoft.com_en-us_microsoftsearch_configure-connector_%22.md due to token count 64\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___cloudpartners.transform.microsoft.com_resources_fasttrack_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___news.microsoft.com__%22.md due to token count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_microsoft-365_business__%22.md due to token count 63\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_investor_default.aspx_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_d_surface-laptop-studio_8SRDF62SWKPF_%22.md due to token count 72\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___learn.microsoft.com_docs__%22.md due to token count 57\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_sustainability__%22.md due to token count 30\n", + "Skipping 
./data/extracted/tech-community/techcommunity.microsoft.com_%22https___developer.microsoft.com_en-us__%22.md due to token count 58\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_security_blog_2022_04_19_the-future-of-compliance-and-data-governance-is-here-introducing-microsoft-purview__%22.md due to token count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_education_buy-license_microsoft365_%22.md due to token count 65\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___privacy.microsoft.com_en-us_%22.md due to token count 57\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___help.bittitan.com_hc_en-us_articles_360011170593-Google-Drive-to-OneDrive-for-Business-v2-Migration-Guide_%22.md due to token count 83\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.nucleustechnologies.com_office-365-migration__%22.md due to token count 66\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_industry_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___azure.microsoft.com_en-us_free_students__%22.md due to token count 61\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___answers.microsoft.com_%22.md due to token count 55\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___techcommunity.microsoft.com_t5_FastTrack-Blog_Microsoft-Ignite-2018-recap-and-join-us-for-Microsoft-Ignite-The_ba-p_269474_%22.md due to token count 90\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___powerplatform.microsoft.com_en-us__%22.md due to token count 59\n", + "Skipping 
./data/extracted/tech-community/techcommunity.microsoft.com_%22https___bsky.app_intent_compose_text=page-name%21%20%F0%9F%A6%8B%0Apage.url_%22.md due to token count 85\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_accessibility_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___visualstudio.microsoft.com__%22.md due to token count 57\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___support.office.com_office-training-center_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_store_b_business_icid=CNavBusinessStore_%22.md due to token count 68\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___twitter.com_hashtag_PowerApps_src=hashtag_click_%22.md due to token count 67\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_d_surface-pro-9_93VKD8NP4FVK_%22.md due to token count 72\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22http___login.microsoftonline.com_443_common_oauth2_v2.0_token_%22.md due to token count 66\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___aka.ms_o365update-youtube_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___learn.microsoft.com__%22.md due to token count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___twitter.com_share_text=page-name&url=page.url_%22.md due to token count 66\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2657_%22.md due to token count 60\n", + "Skipping 
./data/extracted/tech-community/techcommunity.microsoft.com_%22http___fasttrack.microsoft.com_%22.md due to token count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_d_surface-studio-2plus_8VLFQC3597K4_%22.md due to token count 74\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___azure.microsoft.com_en-us__%22.md due to token count 58\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_39802_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https__go.microsoft.com_fwlink_LinkId=521839%5C%22.md due to token count 64\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___education.microsoft.com__%22.md due to token count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___techcommunity.microsoft.com__%22.md due to token count 27\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_education_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22javascript_manageConsent();_%22.md due to token count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___fto365dev.blob.core.windows.net_media_Default_Resources_SharePoint%202013%20Migration%20Offer%20FAQ_Public_September2016.pdf_%22.md due to token count 86\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___azuremarketplace.microsoft.com_en-us__%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_education_devices_overview_%22.md due to token count 62\n", + "Skipping 
./data/extracted/tech-community/techcommunity.microsoft.com_%22http___Server_FQDN_certsrv_mscep_mscep.dll_%22.md due to token count 64\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___answers.microsoft.com_en-us_msoffice_forum_all_migrating-email-from-one-office365-account-to_da67871d-10b3-40d3-9108-cbd0a020c917_%22.md due to token count 96\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___docs.microsoft.com_en-us_sharepointmigration_migrating-content-to-onedrive-for-business_%22.md due to token count 70\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___learn.microsoft.com_en-us_microsoft-365_enterprise_cross-tenant-mailbox-migration_view=o365-worldwide_%22.md due to token count 78\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_about_%22.md due to token count 58\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___azure.microsoft.com_updates_%22.md due to token count 57\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22http___coolesicht.de_likesgivento_fdp.svg_%22.md due to token count 64\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_store_workshops-training-and-events_icid=vl_uf_932020_%22.md due to token count 74\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_d_surface-duo-2_9408KGXP4XJL_%22.md due to token count 73\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___XXXXXX.sharepoint.com_sites_apps_%22.md due to token count 59\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___fto365dev.blob.core.windows.net_media_Default_DocResources_en-us_Office_365_Adoption_Guide.pdf_%22.md due to token count 77\n", + "Skipping 
./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_store_b_payment-financing-options_icid=footer_financing_vcc_%22.md due to token count 73\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_download_%22.md due to token count 58\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_microsoft-cloud_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22http___fasttrack.microsoft.com_office_envision_productivitylibrary_%22.md due to token count 63\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___account.microsoft.com__%22.md due to token count 56\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___docs.microsoft.com_en-us_answers_products_azure_product=all_%22.md due to token count 66\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___docs.microsoft.com_en-us_microsoft-365_security_office-365-security_step-by-step-guides_step-by-step-guide-overview_view=o365-worldwide_%22.md due to token count 87\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22mailto__body=page.url_%22.md due to token count 57\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2660_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_39804_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2663_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___technet.microsoft.com_en-us_library_mt651701.aspx_%22.md due to token count 64\n", + "Skipping 
./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_windows_windows-11-apps_%22.md due to token count 61\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___dynamics.microsoft.com_en-us__%22.md due to token count 29\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https__setup.cloud.microsoft_Q=TechCommunityBlog%5C%22.md due to token count 64\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___techcommunity.microsoft.com_t5_fasttrack-blog_deploying-microsoft-365-just-got-easier-introducing-advanced_ba-p_3635421_%22.md due to token count 85\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___techcommunity.microsoft.com_t5_microsoft-365-blog_announcing-the-preview-of-the-software-updates-page-in-the_ba-p_3465361_%22.md due to token count 84\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https__go.microsoft.com_fwlink_LinkID=206977%5C%22.md due to token count 35\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___aka.ms_yourcaliforniaprivacychoices_%22.md due to token count 63\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22http___bit.ly_O365AUGChiMU_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___careers.microsoft.com__%22.md due to token count 27\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_39803_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___learn.microsoft.com_en-us_microsoftsearch_mssql-connector_%22.md due to token count 64\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___support.microsoft.com_contactus_%22.md due to token count 57\n", + "Skipping 
./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_education_products_teams_%22.md due to token count 62\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_microsoft-teams_group-chat-software_%22.md due to token count 65\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___XXXXXX.sharepoint.com_sites_fasttrack_%22.md due to token count 60\n", + "Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https__go.microsoft.com_fwlink_linkid=2196228%5C%22.md due to token count 66\n", + "Created JSONL file: ./data/jsonl_output/tech-community.jsonl\n", + "Created JSONL file: ./data/jsonl_output/azure-architecture.jsonl\n", + "Skipping ./data/extracted/azure-updates/azure.microsoft.com_en-us_pricing_details_databricks_.md due to token count 268457\n", + "Created JSONL file: ./data/jsonl_output/azure-updates.jsonl\n", + "Skipping ./data/extracted/microsoft-learn/learn.microsoft.com_en-us_azure_governance_policy_samples_built-in-policies.md due to token count 338431\n", + "Created JSONL file: ./data/jsonl_output/microsoft-learn.jsonl\n", + "Skipping ./data/extracted/github-samples/github.com_Azure-Samples_graphrag-accelerator_blob_157e7af9b8cf29b8ea50019b9aff6bd6f6f1ba0b_backend_poetry.lock.md due to token count 204958\n", + "Skipping ./data/extracted/github-samples/github.com_Azure-Samples_openai_blob_21f3d94334c823e42d7adf8578cebc0674363fe1_Solution_Accelerators_Advanced_RAG_src_session_manager_nszrxvuw.component-detection-pip-report.json.md due to token count 388353\n", + "Skipping ./data/extracted/github-samples/github.com_Azure-Samples_openai_blob_21f3d94334c823e42d7adf8578cebc0674363fe1_Solution_Accelerators_Advanced_RAG_src_orchestrator_rag_0kanp4rh.component-detection-pip-report.json.md due to token count 423339\n", + "Created JSONL file: ./data/jsonl_output/github-samples.jsonl\n" + ] + } + ] + }, + { + 
"cell_type": "code", + "source": [ + "# prompt: upload to hugging face\n", + "\n", + "# List files in the output directory\n", + "output_files = [f for f in os.listdir(output_dir) if f.endswith('.jsonl')]\n", + "\n", + "# Upload each JSONL file to Hugging Face Datasets\n", + "for file_name in output_files:\n", + " file_path = os.path.join(output_dir, file_name)\n", + " try:\n", + " api.upload_file(\n", + " path_or_fileobj=file_path,\n", + " path_in_repo=file_name,\n", + " repo_id=\"vicpada/AzureResources\", # Replace with your repo ID\n", + " repo_type=\"dataset\",\n", + " commit_message=f\"Add {file_name}\"\n", + " )\n", + " print(f\"Successfully uploaded {file_name}\")\n", + " except Exception as e:\n", + " print(f\"Error uploading {file_name}: {e}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 264, + "referenced_widgets": [ + "268c8a22fb2a4d2eab9c3e25de984221", + "2d2b4736f11c433585b752139dd70dc9", + "80d59424f44a4b199008e0d16343f4b0", + "ed3db2b25f0e45a7a9cdf4405b1a87a1", + "df1e6015d489417485e3a28bbbd54728", + "f2c35ce7b4264cfa80ace120c2a8c2f9", + "25469448f7d6428ca97a96f9c9f156b7", + "6586ec5c75454671b85bd4659fd9a40e", + "11273ae865b44f2b8d3a4363b3fa1178", + "e36a5f615ae948148aa1222b0ddc514b", + "e64d5d3198634727a6a9f3634fb4afb8", + "079339c01ce24e8ab8a61d24b36ee4e8", + "888a7e6f633746be94943016dc9f7c56", + "b41df158b85c4222ba2d65279337105d", + "3bc4979197e94257a83d7f0deeafa43a", + "2ec303b4d09d41ed83102edec73026f0", + "6b6b1bdbd8f64ad4b458bede86f7f3c6", + "11f49e72f98c4d6aa69247d0e1661a2b", + "bf34ba8bf7bc4dbcb7152ecb22229c78", + "06f2266f798544adb127055305f1671f", + "328ccb5aaad64381acf906d3f2f39434", + "1a556d4a74b44803a88eb88b01f78abf", + "e954c157384742568aeb1783495a652a", + "6265a7718d4d472f895fa8a55de6facf", + "6d550f52e424492f8928886802018388", + "ae40a367b9af4d9f8fdd96cbdad2cf27", + "e927f83749c6446d8734bb2de88e429b", + "cebf15f03d694f7ca7a38febf76cb054", + "cc5663d199724c83b893e3ce965d466d", + 
"f01c1e76140e4797bc79e0880db73304", + "34ff77d08c3d43b98d2d3855543a89dd", + "f08ea4973a31409faf9c07751b5d5dd3", + "790fb4ae984446d9bdeec2a2876807ff", + "185ded41057340a08a5bccc551798f1a", + "5d1143e67464431aa5a96d4c01ae9b73", + "d29703f8bf6d49188cb5a2bd3a5629eb", + "bdc511cd9dcf44f09af118fa11b9c5c3", + "ed8fadcdcde4412b98c68e8d1924eaf0", + "9e3eca47a25f43e69bcf0c9fd3962cae", + "bac81645f79848c8a7fc30e21ba6fcce", + "2f03e86202bd4fc9b2d13e67ee0c7447", + "eb2f160c34814ff1b7aa8a1ad3d42edb", + "3c08de811fbb422eb70ea751acb2e13d", + "1626f1da378045399f650e6fb1ba0429", + "f2aef948c3e248bca2a6b4a84d8534b5", + "e8e7a3c6cac44588a44ec5f4bd3c2771", + "8cd41c5175c546b0a576d7f1f07b31f0", + "fdd2a5d14b5045e6b3ff9f585c12c1b1", + "c74f16c144274bfc81f5484c9d5abf84", + "6663b46404c447ec82eb7b755f46e3f4", + "dcdc3d1d68f14c34931e1d257590ca70", + "6567ebb2bce04053812751d9bb0db2da", + "bbc373b2b34a447c85bc4983b4d62390", + "93590d191413459e878207a0176393be", + "0a674086b9244ba6ac5400b10c3b7a69" + ] + }, + "id": "bfPafnarDngV", + "outputId": "b9f87732-701f-4508-9ebf-a52e0a4919b0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "microsoft-learn.jsonl: 0%| | 0.00/270M [00:00