practical-nlp
diff --git a/‎Ch10/01_BioBERT_Demo.ipynb‎
Lines changed: 1975 additions & 1914 deletions b/‎Ch10/01_BioBERT_Demo.ipynb‎
Lines changed: 1975 additions & 1914 deletions
diff --git a/‎Ch10/02_LexNLP.ipynb‎
Lines changed: 383 additions & 174 deletions b/‎Ch10/02_LexNLP.ipynb‎
Lines changed: 383 additions & 174 deletions
diff --git a/‎Ch10/03_FinBERT.ipynb‎
Lines changed: 300 additions & 309 deletions b/‎Ch10/03_FinBERT.ipynb‎
Lines changed: 300 additions & 309 deletions
diff --git a/‎Ch10/ch10-requirements.txt‎
Lines changed: 17 additions & 0 deletions b/‎Ch10/ch10-requirements.txt‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎Ch2/01_WebScraping_using_BeautifulSoup.ipynb‎
Lines changed: 62 additions & 18 deletions b/‎Ch2/01_WebScraping_using_BeautifulSoup.ipynb‎
Lines changed: 62 additions & 18 deletions
diff --git a/‎Ch2/03_Extracting_text_from_images_tesseract.ipynb‎
Lines changed: 69 additions & 10 deletions b/‎Ch2/03_Extracting_text_from_images_tesseract.ipynb‎
Lines changed: 69 additions & 10 deletions
@@ -0,0 +1,17 @@
+requests==2.23.0
+pytorch-transformers==1.2.0
+transformers==4.7.0
+pandas==1.1.5
+pytorch-pretrained-bert==0.6.2
+pytorch-nlp==0.5.0
+tensorflow==1.14.0
+torch==1.9.0
+keras==2.5.0
+scikit-learn==0.21.3
+tqdm==4.41.1
+matplotlib==3.2.2
+numpy==1.19.5
+nltk==3.5
+lexnlp==1.8.0
+textract==1.6.3
+wget==3.2
@@ -14,31 +14,75 @@
   {
    "cell_type": "code",
    "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: numpy==1.19.5 in c:\\users\\kumar apurv\\anaconda3\\envs\\ch2\\lib\\site-packages (1.19.5)\n",
+      "Requirement already satisfied: beautifulsoup4==4.6.3 in c:\\users\\kumar apurv\\anaconda3\\envs\\ch2\\lib\\site-packages (4.6.3)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# To install only the requirements of this notebook, run this cell\n",
+    "\n",
+    "# ===========================\n",
+    "\n",
+    "!pip install numpy==1.19.5\n",
+    "!pip install beautifulsoup4==4.6.3\n",
+    "\n",
+    "# ==========================="
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n",
+    "\n",
+    "# ===========================\n",
+    "\n",
+    "# try :\n",
+    "#     import google.colab\n",
+    "#     !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch2/ch2-requirements.txt | xargs -n 1 -L 1 pip install\n",
+    "# except ModuleNotFoundError :\n",
+    "#     !pip install -r \"ch2-requirements.txt\"\n",
+    "\n",
+    "# ==========================="
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "metadata": {
     "colab": {},
     "colab_type": "code",
     "id": "P610gMZrd8SE"
    },
    "outputs": [],
    "source": [
-    "#making the necessary imports\n",
+    "# making the necessary imports\n",
     "from pprint import pprint\n",
     "from bs4 import BeautifulSoup\n",
     "from urllib.request import urlopen "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {
     "colab": {},
     "colab_type": "code",
     "id": "jfwgiGjJeBSG"
    },
    "outputs": [],
    "source": [
-    "myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\" #specify the url\n",
-    "html = urlopen(myurl).read() #query the website so that it returns a html page  \n",
+    "myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\" # specify the url\n",
+    "html = urlopen(myurl).read() # query the website so that it returns an HTML page  \n",
     "soupified = BeautifulSoup(html, 'html.parser') # parse the html in the 'html' variable, and store it in Beautiful Soup format"
    ]
   },
@@ -51,16 +95,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#pprint(soupified.prettify())      #for printing the full HTML structure of the webpage"
+    "#pprint(soupified.prettify())      # for printing the full HTML structure of the webpage"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -77,7 +121,7 @@
      "output_type": "stream",
      "text": [
       "('<!DOCTYPE html>\\n'\n",
-      " '<html class=\"html__responsive\" itemscope=\"\" '\n",
+      " '<html class=\"html__responsive html__fixed-top-bar\" itemscope=\"\" '\n",
       " 'itemtype=\"https://schema.org/QAPage\">\\n'\n",
       " ' <head>\\n'\n",
       " '  <title>\\n'\n",
@@ -117,21 +161,21 @@
       " '    <script '\n",
       " 'src=\"https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js\">\\n'\n",
       " '    </script>\\n'\n",
-      " '    <script src=\"https://cdn.sstatic.net/Js/stub.en.js?v=b8a86b92f383\">\\n'\n",
+      " '    <script src=\"https://cdn.sstatic.net/Js/stub.en.js?v=8eabfaaa0deb\">\\n'\n",
       " '    </script>\\n'\n",
-      " '    <link href=\"https://cdn.sstatic.net/Shared/stacks.css?v=f0ad20c3c35c\" '\n",
+      " '    <link href=\"https://cdn.sstatic.net/Shared/stacks.css?v=b4e52b95973a\" '\n",
       " 'rel=\"stylesheet\" type=\"text/css\"/>\\n'\n",
-      " '    <link href=\"https://cdn.sstatic.ne')\n"
+      " '    <link href=\"ht')\n"
      ]
     }
    ],
    "source": [
-    "pprint(soupified.prettify()[:2000])#to get an idea of the html structure of the webpage "
+    "pprint(soupified.prettify()[:2000]) # to get an idea of the html structure of the webpage "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -148,18 +192,18 @@
        "<title>datetime - How to get the current time in Python - Stack Overflow</title>"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "soupified.title #to get the title of the web page "
+    "soupified.title # to get the title of the web page "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -201,11 +245,11 @@
     }
    ],
    "source": [
-    "question = soupified.find(\"div\", {\"class\": \"question\"}) #find the nevessary tag and class which it belongs to\n",
+    "question = soupified.find(\"div\", {\"class\": \"question\"}) # find the necessary tag and class which it belongs to\n",
     "questiontext = question.find(\"div\", {\"class\": \"s-prose js-post-body\"})\n",
     "print(\"Question: \\n\", questiontext.get_text().strip())\n",
     "\n",
-    "answer = soupified.find(\"div\", {\"class\": \"answer\"}) #find the nevessary tag and class which it belongs to\n",
+    "answer = soupified.find(\"div\", {\"class\": \"answer\"}) # find the necessary tag and class which it belongs to\n",
     "answertext = answer.find(\"div\", {\"class\": \"s-prose js-post-body\"})\n",
     "print(\"Best answer: \\n\", answertext.get_text().strip())"
    ]