@@ -29,15 +29,31 @@ const skip = new Set([
2929    // Not in HTML 
3030    25 ,  54 , 
3131] ) ; 
// Upper bound on the number of reports being fetched at the same time.
const MAX_CONCURRENCY = 10;
// When true, earlier revisions are fetched again even if already recorded.
const REFETCH_OLD_VERSIONS = false;

/*
 * Visit every report number from 1 through MAX_REPORT, at most
 * MAX_CONCURRENCY at a time. Numbers listed in `skip` are reported and
 * passed over; all others start from the report's landing URL with no
 * latest id known yet.
 */
async.eachLimit(range(1, MAX_REPORT), MAX_CONCURRENCY, (num, cb) => {
    if (!skip.has(num)) {
        recurseStandard(num, `https://www.unicode.org/reports/tr${num}/`, null, cb);
        return;
    }
    console.log('Skipping report #' + num);
    cb();
}, (err) => {
    if (err) {
        console.log('there was an error');
        console.error(err);
        return;
    }
    // Copy the entries into a fresh object in sorted key order before writing.
    const output = {};
    Object.keys(current).sort().forEach((key) => {
        output[key] = current[key];
    });
    helper.writeBiblio(FILENAME, output);
});
55+ 
56+ function  recurseStandard ( num ,  url ,  latestId ,  cb )  { 
4157    console . log ( 'Fetching' ,  url ,  '...' ) ; 
4258    request ( { 
4359        url, 
@@ -53,13 +69,15 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
5369        console . log ( 'Parsing' ,  url ,  '...' ) ; 
5470        const  dom  =  new  JSDOM ( body ,  {  url } ) ; 
5571        const  {  document }  =  dom . window ; 
56-         const  type  =  document . title . slice ( 0 ,  3 ) ; 
57-         if  ( type  !==  'UTS'  &&  type  !==  'UTR'  &&  type  !==  'UAX' )  { 
58-             console . log ( 'Unable to parse title' ,  document . title ) ; 
72+ 
73+         const  infoTableEl  =  document . querySelector ( '.body > table' ) ; 
74+         const  infoTable  =  infoTableEl  &&  parseTable ( infoTableEl ) ; 
75+         if  ( ! infoTable )  { 
76+             console . log ( 'Unable to find information table' ) ; 
5977            cb ( ) ; 
6078            return ; 
6179        } 
62-          const   id   =   type   +   num ; 
80+ 
6381        const  statusEl  =  document . querySelector ( '.body > h2' ) ; 
6482        if  ( ! statusEl )  { 
6583            console . log ( 'Unable to find status' ) ; 
@@ -68,6 +86,24 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
6886        } 
6987        const  status  =  trimText ( statusEl . textContent ) ; 
7088
89+         let  type  =  document . title . match ( / \b ( U T S | U T R | U A X ) / ) ; 
90+         if  ( type  !==  'UTS'  &&  type  !==  'UTR'  &&  type  !==  'UAX' )  { 
91+             // Fallback for https://www.unicode.org/reports/tr35/ 
92+             const  lowerStatus  =  status . toLowerCase ( ) ; 
93+             if  ( lowerStatus . indexOf ( 'technical standard' )  !=  - 1 )  { 
94+                 type  =  'UTS' ; 
95+             }  else  if  ( lowerStatus . indexOf ( 'standard annex' )  !=  - 1 )  { 
96+                 type  =  'UAX' ; 
97+             }  else  if  ( lowerStatus . indexOf ( 'technical report' )  !=  - 1 )  { 
98+                 type  =  'UTR' ; 
99+             }  else  { 
100+                 console . log ( 'Unable to parse document type' ) ; 
101+                 cb ( ) ; 
102+                 return ; 
103+             } 
104+         } 
105+         const  thisId  =  type  +  num ; 
106+ 
71107        const  titleEl  =  statusEl . nextElementSibling ; 
72108        if  ( ! titleEl  ||  titleEl . tagName  !==  'H1' )  { 
73109            console . log ( 'Unable to find title' ) ; 
@@ -78,69 +114,101 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
78114        if  ( ! / [ a - z ] / . test ( title ) ) 
79115            title  =  titleCase ( title ) ; 
80116
81-         const  infoTableEl  =  document . querySelector ( '.body > table' ) ; 
82-         const  infoTable  =  infoTableEl  &&  parseTable ( infoTableEl ) ; 
83-         if  ( ! infoTable )  { 
84-             console . log ( 'Unable to find information table' ) ; 
117+         if  ( latestId  ==  null )  { 
118+             // This is first scanned document, so the latest version. 
119+             latestId  =  thisId ; 
120+ 
121+             const  authors  =  infoTable . Editor  &&  parseEditor ( infoTable . Editor ) ; 
122+             if  ( ! authors )  { 
123+                 console . log ( 'Unable to find/parse editors in table' ) ; 
124+                 cb ( ) ; 
125+                 return ; 
126+             } 
127+ 
128+             current [ thisId ]  =  { 
129+                 href : url , 
130+                 authors, 
131+                 etAl : authors . etAl , 
132+                 title, 
133+                 status, 
134+                 publisher : 'Unicode Consortium' , 
135+                 versions : current [ latestId ] ?. versions  ??  { } 
136+             } ; 
137+         }  else  if  ( thisId  !=  latestId )  { 
138+             // The document was renamed at some point - create link 
139+             current [ thisId ]  =  {  aliasOf : latestId  } ; 
140+         } 
141+ 
142+         const  href  =  processURL ( infoTable [ 'This Version' ] ) ; 
143+         if  ( ! href )  { 
144+             console . log ( 'Failed to extract version URL' ) ; 
85145            cb ( ) ; 
86146            return ; 
87147        } 
88148
89-         const  date  =  trimText ( infoTable . Date ) ; 
90-         if  ( ! date )  { 
91-             console . log ( 'Unable  to find date in table ' ) ; 
149+         const  revision  =  parseRevision ( href ) ; 
150+         if  ( ! revision )  { 
151+             console . log ( 'Failed  to extract revision ' ) ; 
92152            cb ( ) ; 
93153            return ; 
94154        } 
95-         let  isRawDate  =  / \d { 4 } - \d { 2 } - \d { 2 } / . test ( date ) ; 
96- 
97-         const  href  =  processURL ( infoTable [ 'This Version' ]  ||  url ) ; 
98155
99-         const  authors  =  infoTable . Editor  &&  parseEditor ( infoTable . Editor ) ; 
100-         if  ( ! authors )  { 
101-             console . log ( 'Unable to find/parse editors in table' ) ; 
156+         if  ( ! infoTable . Date )  { 
157+             console . log ( 'Unable to find date in table' ) ; 
158+             cb ( ) ; 
159+             return ; 
160+         } 
161+         /* 
162+          * Replace all spaces. We cannot simply trim as https://www.unicode.org/reports/tr57/tr57-2.html 
163+          * contains "2024- 07-01" due to the coloring. 
164+          */ 
165+         const  rawDate  =  infoTable . Date . replace ( / \s / g,  '' ) ; 
166+         if  ( ! / \d { 4 } - \d { 2 } - \d { 2 } / . test ( rawDate ) )  { 
167+             console . log ( 'Unable to parse data in table' ) ; 
102168            cb ( ) ; 
103169            return ; 
104170        } 
105171
106-         if  ( type  !==  'UAX'  &&  current [ `UAX${ num }  ` ] ) 
107-             current [ `UAX${ num }  ` ]  =  {  aliasOf : id  } ; 
108-         if  ( type  !==  'UTR'  &&  current [ `UTR${ num }  ` ] ) 
109-             current [ `UTR${ num }  ` ]  =  {  aliasOf : id  } ; 
110-         if  ( type  !==  'UTS'  &&  current [ `UTS${ num }  ` ] ) 
111-             current [ `UTS${ num }  ` ]  =  {  aliasOf : id  } ; 
172+         const  version  =  parseVersion ( infoTable . Version ) ; 
173+         if  ( version ) 
174+             title  =  `${ title }   version ${ version }  ` ; 
175+         else 
176+             title  =  `${ title }   revision ${ revision }  ` ; 
112177
113-         current [ id ]  =  { 
114-             authors, 
115-             etAl : authors . etAl , 
178+         const  wasAlreadyDefined  =  revision  in  current [ latestId ] . versions ; 
179+         current [ latestId ] . versions [ revision ]  =  { 
116180            href, 
181+             rawDate, 
117182            title, 
118-             date : isRawDate  ? undefined  : date , 
119-             rawDate : isRawDate  ? date  : undefined , 
120-             status, 
121-             publisher : 'Unicode Consortium' 
183+             status : current [ latestId ] . status  !=  status  ? status  : undefined , 
122184        } ; 
185+ 
186+         /* 
187+          * If this revision was already defined, then don't waste time and bandwidth fetching 
188+          * previous revisions which should have no changes. 
189+          * 
190+          * We're running this check after updating the information for this version in case this 
191+          * is the latest and is a WIP, as we have already downloaded it anyway. 
192+          */ 
193+         if  ( ! wasAlreadyDefined  ||  REFETCH_OLD_VERSIONS )  { 
194+             const  previousUrl  =  processURL ( infoTable [ 'Previous Version' ] ) ; 
195+             if  ( previousUrl )  { 
196+                 recurseStandard ( num ,  previousUrl ,  latestId ,  cb ) ; 
197+                 return ; 
198+             } 
199+         } 
123200        cb ( ) ; 
124201    } ) ; 
125- } ,  ( err )  =>  { 
126-     if  ( err )  { 
127-         console . log ( 'there was an error' ) ; 
128-         console . error ( err ) ; 
129-         return ; 
130-     } 
131-     const  output  =  { } ; 
132-     for  ( const  key  of  Object . keys ( current ) . sort ( ) )  { 
133-         output [ key ]  =  current [ key ] ; 
134-     } 
135-     helper . writeBiblio ( FILENAME ,  output ) ; 
136- } ) ; 
202+ } 
137203
/**
 * Generate consecutive values starting at `from` and stepping by one,
 * up to and including `until`. Yields nothing when `from` exceeds `until`.
 */
function* range(from, until) {
    let value = from;
    while (value <= until) {
        yield value;
        value++;
    }
}
142208
/**
 * Normalize a piece of text: drop every "®" character, strip leading and
 * trailing whitespace, and collapse each run of whitespace into one space.
 * Falsy input (null, undefined, empty string) is returned unchanged.
 */
function trimText(str) {
    if (!str) {
        return str;
    }
    const withoutMark = str.replace(/®/g, '');
    const stripped = withoutMark.trim();
    return stripped.replace(/\s+/g, ' ');
}
146214
@@ -154,9 +222,9 @@ function gatherText(element) {
154222        if  ( node . nodeType  ===  node . ELEMENT_NODE  &&  node . tagName  ===  'BR' ) 
155223            str  +=  '\n' ; 
156224        else 
157-             str  +=  trimText ( node . textContent )   +   ' ' ; 
225+             str  +=  node . textContent ; 
158226    } 
159-     return  str ; 
227+     return  trimText ( str ) ; 
160228} 
161229
162230function  parseTable ( tableEl )  { 
@@ -173,7 +241,16 @@ function parseTable(tableEl) {
173241} 
174242
/**
 * Normalize a URL taken from a table cell.
 *
 * Returns null when the input is falsy or, once cleaned up with trimText,
 * does not start with "http"; otherwise returns the cleaned text with a
 * leading "http:" scheme upgraded to "https:".
 */
function processURL(str) {
    if (!str)
        return null;
    const text = trimText(str);
    /*
     * Check for "Previous Version" in https://www.unicode.org/reports/tr38/tr38-5.html and
     * others, where it is "n/a".
     */
    if (!text.startsWith('http'))
        return null;
    return text.replace(/^http:/, 'https:');
}
178255
179256function  parseEditor ( str )  { 
@@ -184,3 +261,22 @@ function parseEditor(str) {
184261    } 
185262    return  arr ; 
186263} 
264+ 
/**
 * Extract the revision number from a versioned report URL.
 *
 * Returns the revision digits as a string, or null when the URL is falsy
 * or does not contain the expected pattern.
 */
function parseRevision(url) {
    if (!url)
        return null;
    /*
     * Find in the URL the pattern "/tr<num>/tr<num>-<revision>". This works for the two cases:
     *   - /tr<num>/tr<num>-<rev>/tr<num>.html (only UTS #35?)
     *   - /tr<num>/tr<num>-<rev>.html (all others)
     * The backreference requires both "tr<num>" segments to be identical.
     */
    const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/);
    return match ? match.groups.rev : null;
}
276+ 
/**
 * Clean up a version string read from a table cell.
 *
 * Returns null for falsy input; otherwise the text normalized by trimText
 * with a leading "Unicode" (and any whitespace after it) removed.
 */
function parseVersion(str) {
    // Some have "Unicode 11.0.0" instead of the version alone. Strip it.
    return str ? trimText(str).replace(/^Unicode\s*/, '') : null;
}
0 commit comments