@@ -29,15 +29,31 @@ const skip = new Set([
2929    // Not in HTML 
3030    25 ,  54 , 
3131] ) ; 
// Upper bound on how many report numbers are processed at the same time.
const MAX_CONCURRENCY = 5;
// When false, walking back through a report's "Previous Version" chain stops at the
// first revision that is already recorded; when true, older revisions are re-fetched.
const REFETCH_OLD_VERSIONS = false;

/*
 * Visit every report number from 1 to MAX_REPORT, at most MAX_CONCURRENCY at a time.
 * Numbers listed in `skip` are logged and passed over; all others are handed to
 * recurseStandard() starting from the report's canonical (latest) URL.
 *
 * Once every number has been handled, the collected entries in `current` are written
 * out with their keys in sorted order. If any step reported an error, it is logged
 * and nothing is written.
 */
async.eachLimit(range(1, MAX_REPORT), MAX_CONCURRENCY, (num, cb) => {
    if (!skip.has(num)) {
        // A null latestId tells recurseStandard that this is the newest version.
        recurseStandard(num, `https://www.unicode.org/reports/tr${num}/`, null, cb);
        return;
    }

    console.log('Skipping report #' + num);
    cb();
}, (err) => {
    if (err) {
        console.log('there was an error');
        console.error(err);
        return;
    }

    // Rebuild the map with keys inserted in sorted order so the output is stable.
    const sortedKeys = Object.keys(current).sort();
    const output = {};
    for (const id of sortedKeys) {
        output[id] = current[id];
    }
    helper.writeBiblio(FILENAME, output);
});
55+ 
56+ function  recurseStandard ( num ,  url ,  latestId ,  cb )  { 
4157    console . log ( 'Fetching' ,  url ,  '...' ) ; 
4258    request ( { 
4359        url, 
@@ -53,13 +69,7 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
5369        console . log ( 'Parsing' ,  url ,  '...' ) ; 
5470        const  dom  =  new  JSDOM ( body ,  {  url } ) ; 
5571        const  {  document }  =  dom . window ; 
56-         const  type  =  document . title . slice ( 0 ,  3 ) ; 
57-         if  ( type  !==  'UTS'  &&  type  !==  'UTR'  &&  type  !==  'UAX' )  { 
58-             console . log ( 'Unable to parse title' ,  document . title ) ; 
59-             cb ( ) ; 
60-             return ; 
61-         } 
62-         const  id  =  type  +  num ; 
72+ 
6373        const  statusEl  =  document . querySelector ( '.body > h2' ) ; 
6474        if  ( ! statusEl )  { 
6575            console . log ( 'Unable to find status' ) ; 
@@ -68,6 +78,24 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
6878        } 
6979        const  status  =  trimText ( statusEl . textContent ) ; 
7080
81+         let  type  =  document . title . match ( / \b ( U T S | U T R | U A X ) / ) ; 
82+         if  ( type  !==  'UTS'  &&  type  !==  'UTR'  &&  type  !==  'UAX' )  { 
83+             // Fallback for https://www.unicode.org/reports/tr35/ 
84+             const  lowerStatus  =  status . toLowerCase ( ) ; 
85+             if  ( lowerStatus . indexOf ( 'technical standard' )  !=  - 1 )  { 
86+                 type  =  'UTS' ; 
87+             }  else  if  ( lowerStatus . indexOf ( 'standard annex' )  !=  - 1 )  { 
88+                 type  =  'UAX' ; 
89+             }  else  if  ( lowerStatus . indexOf ( 'technical report' )  !=  - 1 )  { 
90+                 type  =  'UTR' ; 
91+             }  else  { 
92+                 console . log ( 'Unable to parse document type' ) ; 
93+                 cb ( ) ; 
94+                 return ; 
95+             } 
96+         } 
97+         const  thisId  =  type  +  num ; 
98+ 
7199        const  titleEl  =  statusEl . nextElementSibling ; 
72100        if  ( ! titleEl  ||  titleEl . tagName  !==  'H1' )  { 
73101            console . log ( 'Unable to find title' ) ; 
@@ -86,62 +114,106 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
86114            return ; 
87115        } 
88116
117+         if  ( latestId  ==  null )  { 
118+             // This is first scanned document, so the latest version. 
119+             latestId  =  thisId ; 
120+ 
121+             const  authors  =  infoTable . Editor  &&  parseEditor ( infoTable . Editor ) ; 
122+             if  ( ! authors )  { 
123+                 console . log ( 'Unable to find/parse editors in table' ) ; 
124+                 cb ( ) ; 
125+                 return ; 
126+             } 
127+ 
128+             current [ thisId ]  =  { 
129+                 href : url , 
130+                 authors, 
131+                 etAl : authors . etAl , 
132+                 title, 
133+                 status, 
134+                 publisher : 'Unicode Consortium' , 
135+                 versions : current [ latestId ] ?. versions  ??  { } 
136+             } ; 
137+         }  else  if  ( thisId  !=  latestId )  { 
138+             // The document was renamed at some point - create link 
139+             current [ thisId ]  =  {  aliasOf : latestId  } ; 
140+         } 
141+ 
89142        const  date  =  trimText ( infoTable . Date ) ; 
90-         if  ( ! date )  { 
143+         if  ( ! date   ||   ! / \d { 4 } - \d { 2 } - \d { 2 } / . test ( date ) )  { 
91144            console . log ( 'Unable to find date in table' ) ; 
92145            cb ( ) ; 
93146            return ; 
94147        } 
95-         let  isRawDate  =  / \d { 4 } - \d { 2 } - \d { 2 } / . test ( date ) ; 
96148
97-         const  href  =  processURL ( infoTable [ 'This Version' ]  ||  url ) ; 
149+         const  href  =  processURL ( infoTable [ 'This Version' ] ) ; 
150+         if  ( ! href )  { 
151+             console . log ( 'Failed to extract version URL' ) ; 
152+             cb ( ) ; 
153+             return ; 
154+         } 
98155
99-         const  authors  =  infoTable . Editor   &&   parseEditor ( infoTable . Editor ) ; 
100-         if  ( ! authors )  { 
101-             console . log ( 'Unable  to find/parse editors in table ' ) ; 
156+         const  revision  =  parseRevision ( href ) ; 
157+         if  ( ! revision )  { 
158+             console . log ( 'Failed  to extract revision ' ) ; 
102159            cb ( ) ; 
103160            return ; 
104161        } 
105162
106-         if   ( type   !==   'UAX'   &&   current [ `UAX ${ num } ` ] ) 
107-              current [ `UAX ${ num } ` ]   =   {   aliasOf :  id   } ; 
108-         if  ( type   !==   'UTR'   &&   current [ `UTR ${ num } ` ] ) 
109-             current [ `UTR ${ num } ` ]   =   {   aliasOf :  id   } ; 
110-         if   ( type   !==   'UTS'   &&   current [ `UTS ${ num } ` ] ) 
111-             current [ `UTS ${ num } ` ]   =   {   aliasOf :  id   } ; 
163+         const   version   =   parseVersion ( infoTable . Version ) ; 
164+ 
165+         if  ( version ) 
166+             title   =   ` ${ title }  version  ${ version } ` ; 
167+         else 
168+             title   =   ` ${ title }  revision  ${ revision } ` ; 
112169
113-         current [ id ]  =  { 
114-             authors, 
115-             etAl : authors . etAl , 
170+         const  wasAlreadyDefined  =  revision  in  current [ latestId ] . versions ; 
171+         current [ latestId ] . versions [ revision ]  =  { 
116172            href, 
173+             rawDate : date , 
117174            title, 
118-             date : isRawDate  ? undefined  : date , 
119-             rawDate : isRawDate  ? date  : undefined , 
120-             status, 
121-             publisher : 'Unicode Consortium' 
175+             status : current [ latestId ] . status  !=  status  ? status  : undefined , 
122176        } ; 
177+ 
178+         /* 
179+          * If this revision was already defined, then don't waste time and bandwidth fetching 
180+          * previous revisions which should have no changes. 
181+          * 
182+          * We're running this check after updating the information for this version in case this 
183+          * is the latest and is a WIP, as we have already downloaded it anyway. 
184+          */ 
185+         if  ( ! wasAlreadyDefined  ||  REFETCH_OLD_VERSIONS )  { 
186+             const  previousUrl  =  processURL ( infoTable [ 'Previous Version' ] ) ; 
187+             if  ( previousUrl )  { 
188+                 recurseStandard ( num ,  previousUrl ,  latestId ,  cb ) ; 
189+                 return ; 
190+             } 
191+         } 
123192        cb ( ) ; 
124193    } ) ; 
125- } ,  ( err )  =>  { 
126-     if  ( err )  { 
127-         console . log ( 'there was an error' ) ; 
128-         console . error ( err ) ; 
129-         return ; 
130-     } 
131-     const  output  =  { } ; 
132-     for  ( const  key  of  Object . keys ( current ) . sort ( ) )  { 
133-         output [ key ]  =  current [ key ] ; 
134-     } 
135-     helper . writeBiblio ( FILENAME ,  output ) ; 
136- } ) ; 
194+ } 
137195
/**
 * Lazily yield consecutive values starting at `from` and ending at `until`,
 * both ends included. Yields nothing when `from` is greater than `until`.
 *
 * @param {number} from - First value to yield.
 * @param {number} until - Last value to yield (inclusive).
 */
function* range(from, until) {
    let value = from;
    while (value <= until) {
        yield value;
        value++;
    }
}
142200
/**
 * Normalise text scraped from a report page.
 *
 * Removes every "®" sign, trims both ends, and then collapses each run of
 * whitespace: a run that contains at least one newline becomes a single "\n",
 * any other run becomes a single space.
 *
 * @param {?string} str - Text to clean up.
 * @returns {?string} The cleaned text; falsy input (null, undefined, '') is
 *     returned unchanged.
 */
function trimText(str) {
    if (!str)
        return str;

    /*
     * A single pass over maximal whitespace runs yields the same result as first
     * collapsing newline-containing runs and then the remaining ones, but it needs
     * no set-subtraction class (RegExp `v` flag) and each run is scanned only once,
     * so long stretches of spaces cannot cause repeated rescanning.
     */
    return str
        .replace(/®/g, '')
        .trim()
        .replace(/\s+/g, (run) => (run.includes('\n') ? '\n' : ' '));
}
146218
147219function  titleCase ( str )  { 
@@ -154,9 +226,9 @@ function gatherText(element) {
154226        if  ( node . nodeType  ===  node . ELEMENT_NODE  &&  node . tagName  ===  'BR' ) 
155227            str  +=  '\n' ; 
156228        else 
157-             str  +=  trimText ( node . textContent )   +   ' ' ; 
229+             str  +=  node . textContent ; 
158230    } 
159-     return  str ; 
231+     return  trimText ( str ) ; 
160232} 
161233
162234function  parseTable ( tableEl )  { 
@@ -173,7 +245,16 @@ function parseTable(tableEl) {
173245} 
174246
/**
 * Clean up a URL taken from a report's info table and force it to https.
 *
 * @param {?string} str - Raw cell text expected to hold a URL.
 * @returns {?string} The trimmed URL with a leading "http:" rewritten to
 *     "https:", or null when the input is empty or does not begin with "http".
 */
function processURL(str) {
    const text = str ? trimText(str) : null;

    /*
     * Some tables carry a placeholder instead of a link, e.g. the "Previous Version"
     * entry in https://www.unicode.org/reports/tr38/tr38-5.html is "n/a".
     */
    if (!text || !text.startsWith('http'))
        return null;

    return text.replace(/^http:/, 'https:');
}
178259
179260function  parseEditor ( str )  { 
@@ -184,3 +265,22 @@ function parseEditor(str) {
184265    } 
185266    return  arr ; 
186267} 
268+ 
/**
 * Extract the revision number from a versioned report URL.
 *
 * Looks for the pattern "/tr<num>/tr<num>-<revision>" in the URL, where both
 * <num> parts must be identical. This covers the two known layouts:
 *   - /tr<num>/tr<num>-<rev>/tr<num>.html (only UTS #35?)
 *   - /tr<num>/tr<num>-<rev>.html (all others)
 *
 * @param {?string} url - URL of a specific report revision.
 * @returns {?string} The revision digits as a string, or null when the URL is
 *     empty or does not contain the pattern.
 */
function parseRevision(url) {
    if (!url)
        return null;
    // The backreference \1 ensures the directory and file name refer to the same report.
    const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/);
    return match ? match.groups.rev : null;
}
280+ 
/**
 * Clean up the "Version" value from a report's info table.
 *
 * @param {?string} str - Raw cell text.
 * @returns {?string} The normalised text with a leading "Unicode" prefix (and
 *     any whitespace after it) removed, or null when the input is empty.
 */
function parseVersion(str) {
    if (str) {
        // Some reports say "Unicode 11.0.0" rather than just the version number.
        const text = trimText(str);
        return text.replace(/^Unicode\s*/, '');
    }
    return null;
}
0 commit comments