|
14 | 14 | { |
15 | 15 | "cell_type": "code", |
16 | 16 | "execution_count": 1, |
| 17 | + "metadata": {}, |
| 18 | + "outputs": [ |
| 19 | + { |
| 20 | + "name": "stdout", |
| 21 | + "output_type": "stream", |
| 22 | + "text": [ |
| 23 | + "Requirement already satisfied: numpy==1.19.5 in c:\\users\\kumar apurv\\anaconda3\\envs\\ch2\\lib\\site-packages (1.19.5)\n", |
| 24 | + "Requirement already satisfied: beautifulsoup4==4.6.3 in c:\\users\\kumar apurv\\anaconda3\\envs\\ch2\\lib\\site-packages (4.6.3)\n" |
| 25 | + ] |
| 26 | + } |
| 27 | + ], |
| 28 | + "source": [ |
| 29 | + "# To install only the requirements of this notebook, run this cell\n", |
| 30 | + "\n", |
| 31 | + "# ===========================\n", |
| 32 | + "\n", |
| 33 | + "!pip install numpy==1.19.5\n", |
| 34 | + "!pip install beautifulsoup4==4.6.3\n", |
| 35 | + "\n", |
| 36 | + "# ===========================" |
| 37 | + ] |
| 38 | + }, |
| 39 | + { |
| 40 | + "cell_type": "code", |
| 41 | + "execution_count": 2, |
| 42 | + "metadata": {}, |
| 43 | + "outputs": [], |
| 44 | + "source": [ |
| 45 | + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", |
| 46 | + "\n", |
| 47 | + "# ===========================\n", |
| 48 | + "\n", |
| 49 | + "# try :\n", |
| 50 | + "# import google.colab\n", |
| 51 | + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch2/ch2-requirements.txt | xargs -n 1 -L 1 pip install\n", |
| 52 | + "# except ModuleNotFoundError :\n", |
| 53 | + "# !pip install -r \"ch2-requirements.txt\"\n", |
| 54 | + "\n", |
| 55 | + "# ===========================" |
| 56 | + ] |
| 57 | + }, |
| 58 | + { |
| 59 | + "cell_type": "code", |
| 60 | + "execution_count": 3, |
17 | 61 | "metadata": { |
18 | 62 | "colab": {}, |
19 | 63 | "colab_type": "code", |
20 | 64 | "id": "P610gMZrd8SE" |
21 | 65 | }, |
22 | 66 | "outputs": [], |
23 | 67 | "source": [ |
24 | | - "#making the necessary imports\n", |
| 68 | + "# making the necessary imports\n", |
25 | 69 | "from pprint import pprint\n", |
26 | 70 | "from bs4 import BeautifulSoup\n", |
27 | 71 | "from urllib.request import urlopen " |
28 | 72 | ] |
29 | 73 | }, |
30 | 74 | { |
31 | 75 | "cell_type": "code", |
32 | | - "execution_count": 2, |
| 76 | + "execution_count": 4, |
33 | 77 | "metadata": { |
34 | 78 | "colab": {}, |
35 | 79 | "colab_type": "code", |
36 | 80 | "id": "jfwgiGjJeBSG" |
37 | 81 | }, |
38 | 82 | "outputs": [], |
39 | 83 | "source": [ |
40 | | - "myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\" #specify the url\n", |
41 | | - "html = urlopen(myurl).read() #query the website so that it returns a html page \n", |
| 84 | + "myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\" # specify the url\n", |
| 85 | + "html = urlopen(myurl).read() # query the website so that it returns an HTML page \n", |
42 | 86 | "soupified = BeautifulSoup(html, 'html.parser') # parse the html in the 'html' variable, and store it in Beautiful Soup format" |
43 | 87 | ] |
44 | 88 | }, |
|
51 | 95 | }, |
52 | 96 | { |
53 | 97 | "cell_type": "code", |
54 | | - "execution_count": 3, |
| 98 | + "execution_count": 5, |
55 | 99 | "metadata": {}, |
56 | 100 | "outputs": [], |
57 | 101 | "source": [ |
58 | | - "#pprint(soupified.prettify()) #for printing the full HTML structure of the webpage" |
| 102 | + "#pprint(soupified.prettify()) # for printing the full HTML structure of the webpage" |
59 | 103 | ] |
60 | 104 | }, |
61 | 105 | { |
62 | 106 | "cell_type": "code", |
63 | | - "execution_count": 4, |
| 107 | + "execution_count": 6, |
64 | 108 | "metadata": { |
65 | 109 | "colab": { |
66 | 110 | "base_uri": "https://localhost:8080/", |
|
77 | 121 | "output_type": "stream", |
78 | 122 | "text": [ |
79 | 123 | "('<!DOCTYPE html>\\n'\n", |
80 | | - " '<html class=\"html__responsive\" itemscope=\"\" '\n", |
| 124 | + " '<html class=\"html__responsive html__fixed-top-bar\" itemscope=\"\" '\n", |
81 | 125 | " 'itemtype=\"https://schema.org/QAPage\">\\n'\n", |
82 | 126 | " ' <head>\\n'\n", |
83 | 127 | " ' <title>\\n'\n", |
|
117 | 161 | " ' <script '\n", |
118 | 162 | " 'src=\"https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js\">\\n'\n", |
119 | 163 | " ' </script>\\n'\n", |
120 | | - " ' <script src=\"https://cdn.sstatic.net/Js/stub.en.js?v=b8a86b92f383\">\\n'\n", |
| 164 | + " ' <script src=\"https://cdn.sstatic.net/Js/stub.en.js?v=8eabfaaa0deb\">\\n'\n", |
121 | 165 | " ' </script>\\n'\n", |
122 | | - " ' <link href=\"https://cdn.sstatic.net/Shared/stacks.css?v=f0ad20c3c35c\" '\n", |
| 166 | + " ' <link href=\"https://cdn.sstatic.net/Shared/stacks.css?v=b4e52b95973a\" '\n", |
123 | 167 | " 'rel=\"stylesheet\" type=\"text/css\"/>\\n'\n", |
124 | | - " ' <link href=\"https://cdn.sstatic.ne')\n" |
| 168 | + " ' <link href=\"ht')\n" |
125 | 169 | ] |
126 | 170 | } |
127 | 171 | ], |
128 | 172 | "source": [ |
129 | | - "pprint(soupified.prettify()[:2000])#to get an idea of the html structure of the webpage " |
| 173 | + "pprint(soupified.prettify()[:2000]) # to get an idea of the html structure of the webpage " |
130 | 174 | ] |
131 | 175 | }, |
132 | 176 | { |
133 | 177 | "cell_type": "code", |
134 | | - "execution_count": 5, |
| 178 | + "execution_count": 7, |
135 | 179 | "metadata": { |
136 | 180 | "colab": { |
137 | 181 | "base_uri": "https://localhost:8080/", |
|
148 | 192 | "<title>datetime - How to get the current time in Python - Stack Overflow</title>" |
149 | 193 | ] |
150 | 194 | }, |
151 | | - "execution_count": 5, |
| 195 | + "execution_count": 7, |
152 | 196 | "metadata": {}, |
153 | 197 | "output_type": "execute_result" |
154 | 198 | } |
155 | 199 | ], |
156 | 200 | "source": [ |
157 | | - "soupified.title #to get the title of the web page " |
| 201 | + "soupified.title # to get the title of the web page " |
158 | 202 | ] |
159 | 203 | }, |
160 | 204 | { |
161 | 205 | "cell_type": "code", |
162 | | - "execution_count": 6, |
| 206 | + "execution_count": 8, |
163 | 207 | "metadata": { |
164 | 208 | "colab": { |
165 | 209 | "base_uri": "https://localhost:8080/", |
|
201 | 245 | } |
202 | 246 | ], |
203 | 247 | "source": [ |
204 | | - "question = soupified.find(\"div\", {\"class\": \"question\"}) #find the nevessary tag and class which it belongs to\n", |
| 248 | + "question = soupified.find(\"div\", {\"class\": \"question\"}) # find the necessary tag and class which it belongs to\n", |
205 | 249 | "questiontext = question.find(\"div\", {\"class\": \"s-prose js-post-body\"})\n", |
206 | 250 | "print(\"Question: \\n\", questiontext.get_text().strip())\n", |
207 | 251 | "\n", |
208 | | - "answer = soupified.find(\"div\", {\"class\": \"answer\"}) #find the nevessary tag and class which it belongs to\n", |
| 252 | + "answer = soupified.find(\"div\", {\"class\": \"answer\"}) # find the necessary tag and class which it belongs to\n", |
209 | 253 | "answertext = answer.find(\"div\", {\"class\": \"s-prose js-post-body\"})\n", |
210 | 254 | "print(\"Best answer: \\n\", answertext.get_text().strip())" |
211 | 255 | ] |
|
0 commit comments