@@ -633,3 +633,161 @@ pub mod ascii {
633633 )
634634 }
635635}
636+
637+ pub mod utf16_le {
638+ use super :: * ;
639+
640+ pub const ENCODING_NAME : & str = "utf-16-le" ;
641+
642+ pub fn encode < Ctx , E > ( mut ctx : Ctx , errors : & E ) -> Result < Vec < u8 > , Ctx :: Error >
643+ where
644+ Ctx : EncodeContext ,
645+ E : EncodeErrorHandler < Ctx > ,
646+ {
647+ let mut out = Vec :: < u8 > :: new ( ) ;
648+ loop {
649+ let data = ctx. remaining_data ( ) ;
650+ let error_info = {
651+ let mut iter = iter_code_points ( data) ;
652+ iter. find ( |( _, c) | c. to_u32 ( ) > 0x10FFFF )
653+ } ;
654+ let Some ( ( i, ch) ) = error_info else {
655+ break ;
656+ } ;
657+
658+ // Add valid part up to the error
659+ for ch in data[ ..i. bytes ] . code_points ( ) {
660+ let ch_u32 = ch. to_u32 ( ) ;
661+ if ch_u32 <= 0xFFFF {
662+ out. extend_from_slice ( & ( ch_u32 as u16 ) . to_le_bytes ( ) ) ;
663+ } else if ch_u32 <= 0x10FFFF {
664+ let code = ch_u32 - 0x10000 ;
665+ let high = 0xD800 + ( code >> 10 ) ;
666+ let low = 0xDC00 + ( code & 0x3FF ) ;
667+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
668+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
669+ }
670+ }
671+
672+ let err_start = ctx. position ( ) + i;
673+ let err_end = StrSize { bytes : i. bytes + ch. len_wtf8 ( ) , chars : i. chars + 1 } ;
674+ let err_end = ctx. position ( ) + err_end;
675+ let replace = ctx. handle_error ( errors, err_start..err_end, Some ( "surrogates not allowed" ) ) ?;
676+ match replace {
677+ EncodeReplace :: Str ( s) => {
678+ // Re-encode the replacement string
679+ for cp in s. as_ref ( ) . code_points ( ) {
680+ let cp_u32 = cp. to_u32 ( ) ;
681+ if cp_u32 <= 0xFFFF {
682+ out. extend_from_slice ( & ( cp_u32 as u16 ) . to_le_bytes ( ) ) ;
683+ } else if cp_u32 <= 0x10FFFF {
684+ let code = cp_u32 - 0x10000 ;
685+ let high = 0xD800 + ( code >> 10 ) ;
686+ let low = 0xDC00 + ( code & 0x3FF ) ;
687+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
688+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
689+ }
690+ }
691+ }
692+ EncodeReplace :: Bytes ( b) => {
693+ out. extend_from_slice ( b. as_ref ( ) ) ;
694+ }
695+ }
696+ }
697+
698+ // Process all remaining data
699+ for ch in ctx. remaining_data ( ) . code_points ( ) {
700+ let ch_u32 = ch. to_u32 ( ) ;
701+ if ch_u32 <= 0xFFFF {
702+ out. extend_from_slice ( & ( ch_u32 as u16 ) . to_le_bytes ( ) ) ;
703+ } else if ch_u32 <= 0x10FFFF {
704+ let code = ch_u32 - 0x10000 ;
705+ let high = 0xD800 + ( code >> 10 ) ;
706+ let low = 0xDC00 + ( code & 0x3FF ) ;
707+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
708+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
709+ }
710+ }
711+ Ok ( out)
712+ }
713+
714+ pub fn decode < Ctx : DecodeContext , E : DecodeErrorHandler < Ctx > > (
715+ mut ctx : Ctx ,
716+ errors : & E ,
717+ final_decode : bool ,
718+ ) -> Result < ( Wtf8Buf , usize ) , Ctx :: Error > {
719+ let mut out = Wtf8Buf :: new ( ) ;
720+
721+ while ctx. remaining_data ( ) . len ( ) >= 2 {
722+ let data = ctx. remaining_data ( ) ;
723+ let ch = u16:: from_le_bytes ( [ data[ 0 ] , data[ 1 ] ] ) ;
724+
725+ if ch < 0xD800 || ch > 0xDFFF {
726+ // BMP character
727+ if let Some ( c) = char:: from_u32 ( ch as u32 ) {
728+ out. push_str ( & c. to_string ( ) ) ;
729+ ctx. advance ( 2 ) ;
730+ } else {
731+ let pos = ctx. position ( ) ;
732+ let replace = ctx. handle_error ( errors, pos..pos + 2 , Some ( "invalid character" ) ) ?;
733+ out. push_wtf8 ( replace. as_ref ( ) ) ;
734+ // Don't advance here, the error handler already positioned us
735+ }
736+ } else if ch >= 0xD800 && ch <= 0xDBFF {
737+ // High surrogate
738+ if data. len ( ) < 4 {
739+ if final_decode {
740+ let pos = ctx. position ( ) ;
741+ let replace = ctx. handle_error ( errors, pos..pos + 2 , Some ( "unexpected end of data" ) ) ?;
742+ out. push_wtf8 ( replace. as_ref ( ) ) ;
743+ // Don't advance here, the error handler already positioned us
744+ } else {
745+ // In partial mode, stop here and return what we have
746+ break ;
747+ }
748+ } else {
749+ let ch2 = u16:: from_le_bytes ( [ data[ 2 ] , data[ 3 ] ] ) ;
750+ if ch2 >= 0xDC00 && ch2 <= 0xDFFF {
751+ // Valid surrogate pair
752+ let code = ( ( ( ch & 0x3FF ) as u32 ) << 10 ) | ( ( ch2 & 0x3FF ) as u32 ) ;
753+ let code_point = code + 0x10000 ;
754+ if let Some ( c) = char:: from_u32 ( code_point) {
755+ out. push_str ( & c. to_string ( ) ) ;
756+ ctx. advance ( 4 ) ;
757+ } else {
758+ let pos = ctx. position ( ) ;
759+ let replace = ctx. handle_error ( errors, pos..pos + 4 , Some ( "invalid surrogate pair" ) ) ?;
760+ out. push_wtf8 ( replace. as_ref ( ) ) ;
761+ // Don't advance here, the error handler already positioned us
762+ }
763+ } else {
764+ // Invalid surrogate pair
765+ let pos = ctx. position ( ) ;
766+ let replace = ctx. handle_error ( errors, pos..pos + 2 , Some ( "illegal UTF-16 surrogate" ) ) ?;
767+ out. push_wtf8 ( replace. as_ref ( ) ) ;
768+ // Don't advance here, the error handler already positioned us
769+ }
770+ }
771+ } else {
772+ // Low surrogate without high surrogate
773+ let pos = ctx. position ( ) ;
774+ let replace = ctx. handle_error ( errors, pos..pos + 2 , Some ( "illegal UTF-16 surrogate" ) ) ?;
775+ out. push_wtf8 ( replace. as_ref ( ) ) ;
776+ // Don't advance here, the error handler already positioned us
777+ }
778+ }
779+
780+ // Handle remaining single byte
781+ if ctx. remaining_data ( ) . len ( ) == 1 {
782+ if final_decode {
783+ let pos = ctx. position ( ) ;
784+ let replace = ctx. handle_error ( errors, pos..pos + 1 , Some ( "truncated data" ) ) ?;
785+ out. push_wtf8 ( replace. as_ref ( ) ) ;
786+ // Don't advance here, the error handler already positioned us
787+ }
788+ // In partial mode, just leave it for next call
789+ }
790+
791+ Ok ( ( out, ctx. position ( ) ) )
792+ }
793+ }
0 commit comments