(*
 ZipReader.sml
 K.Mackenzie,  May 2004
*)

(* Attempt at semi-efficient jar file parser. *)
(* We have to use Nonstdio to get random file access *)
(* See end of file for Zip format *)

exception ZipReaderError of string

val archiveName = ref ""
fun setArchName s = archiveName := s
fun error s = raise ZipReaderError ("[" ^ (!archiveName) ^ "]: " ^ s)
(* Not too elegant,  but saves us from having to thread archive name through all of code *)

type entry_info = {
     offset:  int,    (* offset of file data from start of archive *)
     csize:   int,    (* size of compressed data *)
     usize:   int,    (* size of uncompressed data *)
     cmethod: int     (* compression method *)
}

type zipfile = {
     archive_name: string,
     is_open: bool ref,
     is : BasicIO.instream,
     files : (string, entry_info) Polyhash.hash_table
}

fun input_byte is = Char.ord (Nonstdio.input_char is)

fun getBytes is n =
    let fun g 0 acc = rev acc
	  | g n acc = g (n-1) (input_byte is :: acc)
    in
	g n []
    end

fun seek is n = Nonstdio.seek_in is n
fun skip is n = Nonstdio.seek_in is ((Nonstdio.pos_in is) + n)
fun getPos is = Nonstdio.pos_in is

fun getWord8Vector is n =
let val b = getBytes is n handle Size => error "[internal: exn: Size]"
    val w = map Word8.fromInt b
in
    Word8Vector.fromList w
end

fun getString is n =
    let fun g 0 acc = String.implode (rev acc)
	  | g r acc = g (r-1) ((Nonstdio.input_char is)::acc)
    in
	g n [] handle Size => error "string overflow"
    end

fun toHex n = StringCvt.padLeft #"0" 2 (Int.fmt StringCvt.HEX n)

local
    fun hexListToInt l = (* bytes in reverse order *)
    let fun p [] = 0
	  | p (h::t) = h + 256* (p t)
    in
	p l handle Overflow =>
		   error ("overflow: number exceeds Moscow ML's 31-bit limit")
    end
in
    fun getInt4 is = hexListToInt (getBytes is 4)
end

fun getInt2 is =
    let
	val b1 = input_byte is
	val b2 = input_byte is
    in
	b1 + 256*b2
    end


datatype section =
	 LOCAL
       | EXTENDED_LOCAL
       | CENTRAL_DIRECTORY
       | END
       | UNKNOWN of int list


val hdrHdr = 0x50

fun getHeader is =
    case getBytes is 4 of
	[0x50, 0x4b, 0x01, 0x02] => CENTRAL_DIRECTORY
      | [0x50, 0x4b, 0x03, 0x04] => LOCAL
      | [0x50, 0x4b, 0x05, 0x06] => END
      | [0x50, 0x4b, 0x07, 0x08] => EXTENDED_LOCAL
      | v => UNKNOWN v


fun uncompress data method usize =
    let fun err s = error ("unsupported compression method [" ^ s ^ "]")
    in
    case method of
          0 => data
	| 1 => err "The file is Shrunk"
	| 2 => err "The file is Reduced with compression factor 1"
        | 3 => err "The file is Reduced with compression factor 2"
        | 4 => err "The file is Reduced with compression factor 3"
        | 5 => err "The file is Reduced with compression factor 4"
        | 6 => err "The file is Imploded"
        | 7 => err "Reserved for Tokenizing compression algorithm"
        | 8 => Inflate.inflate data usize
	| _ => err ("Unknown type " ^ Int.toString method)
    end


fun get_data is {offset, csize, usize, cmethod} =
    let
	val () = seek is offset (* -> start of local directory for file *)
	val () = case
	    getHeader is of
	    LOCAL => ()
	  | _ => error "bad local file header"

	val () = skip is 22   (* skip lots of stuff, including the local copies of the
			         compressed and uncompressed sizes,  which sometimes
                                 appear to be untrustworthy. *)

	val flen = getInt2 is (* length of filename *)
	val elen = getInt2 is (* length of extra data *)
	val () = skip is (flen+elen) (* filename (which we already know), then extra data *)
	val data = getWord8Vector is csize
    in
	uncompress data cmethod usize handle Subscript => error "[internal: Subscript]"
    end


fun seekCentralDirectory is =
    let
	fun skipBack is n = skip is (~n)
	    handle SysErr _ =>
		   error "couldn't find central directory"

	fun seekHeader is = (* Only called if archive contains final comment (or is corrupt) *)
	    if input_byte is = hdrHdr then
		case (skipBack is 1; getHeader is) of
		    END => ()
		  | _ => (skipBack is 5; seekHeader is)
	    else (skipBack is 2; seekHeader is)

	val () =
	    case getHeader is of
		UNKNOWN v => error "bad zip header"
	      | _ =>
		let
		    val filesize = Nonstdio.in_stream_length is
		    val () = seek is (filesize-22)
		in
		    case getHeader is of
			END => ()
		      | _ => seekHeader is (* zipfile may contain a comment, putting header in wrong place *)
		end
		handle Size => error "size error in readZipFileDir"

        (* Now we should be at the start of the end-of-central-directory record *)

	val diskn     = getInt2 is
	val () = if diskn = 0 then ()
		 else error ("archive appears to contain " ^ Int.toString (diskn+1) ^ " disks")
	val cdiskn    = getInt2 is
	val () = if cdiskn = 0 then ()
		 else error ("central directory appears to be on disk number" ^ Int.toString diskn)
	val n1        = getInt2 is    (* Number of central directory entries on this disk *)
	val cdirsize  = getInt2 is    (* Total number of central directory entries *)
	val () = if n1 = cdirsize then ()
		 else error "central directory on multiple disks???"
	val ()        = skip is 4     (* Physical size of central directory *)
	val cdirstart = getInt4 is
    in
	seek is cdirstart (* Now we're at the start of the central directory *)
    end

    fun nextCdirEntry is =
	case getHeader is of
	    CENTRAL_DIRECTORY =>
	    let
		val () = skip is 6
		val cmethod = getInt2 is
		val () = skip is 8
		val csize = getInt4 is (* Compressed size *)
		val usize = getInt4 is (* Uncompressed size *)
		val fname_len = getInt2 is
		val elen = getInt2 is (* length of extra field *)
		val clen = getInt2 is (* length of file comment *)
		val () = skip is 8
		val offset = getInt4 is (* offset of local hdr from start of file *)
		val fname = getString is fname_len
		val () = skip is (elen+clen)
	    in
		SOME (fname, {offset=offset, csize=csize, usize=usize, cmethod=cmethod})
	    end
	  | _ => NONE (* At the end of the central directory *)
                (* We trust that the cdir has the correct structure. *)
                (* We could do it more safely by iterating through the cdir the appropriate number
                  (ie, cdirsize) of times,  checking for a cdir header each time. *)

fun ensureOpen (z: zipfile) =
    let
	val () = setArchName (#archive_name z)
    in
	if
	    (!(#is_open z))
	then ()
	else error "attempting to access closed zipfile"
    end

local
    fun cache_cdir_entries is htable =
	case nextCdirEntry is of
	    NONE => ()
	  | SOME (fname, info) =>
	    ( Polyhash.insert htable (fname, info);
	      cache_cdir_entries is htable )
in

fun open_in s =
    let
	val () = setArchName s
	val () = if Nonstdio.file_exists s then ()
		 else error "can't find zipfile"
	val is = Nonstdio.open_in_bin s
	val files = Polyhash.mkTable (Polyhash.hash,op=) (200, ZipReaderError "unknown member")
	val () = seekCentralDirectory is
	val () = cache_cdir_entries is files
    in
	{is = is, files = files, archive_name = s, is_open = ref true}
    end
end


fun inputMember (z: zipfile, fname: string) =
    (ensureOpen z;
     case Polyhash.peek (#files z) fname of
	 NONE => NONE
       | SOME info => SOME (get_data (#is z) info)
    )

fun close_in (z: zipfile) = (ensureOpen z; BasicIO.close_in (#is z); #is_open z := false)

fun apply f (z: zipfile) = (ensureOpen z; Polyhash.apply (fn (p,_) => f p) (#files z))
fun members (z: zipfile) = (ensureOpen z; map #1 (Polyhash.listItems (#files z)))


(* Extract a single member in one go *)
local
fun scan_for_entry is entry_name zipname =
    let fun scan () =
	    case nextCdirEntry is of
		NONE => error ("unknown zip member [" ^ entry_name ^ "]")
	      | SOME (fname, info) =>
		if fname = entry_name then get_data is info
		else scan ()
    in
	scan ()
    end
in
fun extractFile (zipname: string, fname: string) =
    let
	val () = setArchName zipname
	val () = if Nonstdio.file_exists zipname then ()
		 else error "can't find zipfile"
	val is = Nonstdio.open_in_bin zipname
	val () = seekCentralDirectory is
	val data = scan_for_entry is fname zipname
	val () = BasicIO.close_in is
    in
	data
    end
end



(* -------------------------------- Zip format -------------------------------- *)
(*

ZIP format

Byte order: Little-endian

Overall zipfile format:
[Local file header + Compressed data [+ Extended local header]?]*
[Central directory]*
[End of central directory record]

*Local file header:*
Offset   Length   Contents
  0      4 bytes  Local file header signature (0x04034b50)
  4      2 bytes  Version needed to extract
  6      2 bytes  General purpose bit flag
  8      2 bytes  Compression method
 10      2 bytes  Last mod file time
 12      2 bytes  Last mod file date
 14      4 bytes  CRC-32
 18      4 bytes  Compressed size (n)
 22      4 bytes  Uncompressed size
 26      2 bytes  Filename length (f)
 28      2 bytes  Extra field length (e)
        (f)bytes  Filename
        (e)bytes  Extra field
        (n)bytes  Compressed data

*Extended local header:*
Offset   Length   Contents
  0      4 bytes  Extended Local file header signature (0x08074b50)
  4      4 bytes  CRC-32
  8      4 bytes  Compressed size
 12      4 bytes  Uncompressed size

*Central directory:*
Offset   Length   Contents
  0      4 bytes  Central file header signature (0x02014b50)
  4      2 bytes  Version made by
  6      2 bytes  Version needed to extract
  8      2 bytes  General purpose bit flag
 10      2 bytes  Compression method
 12      2 bytes  Last mod file time
 14      2 bytes  Last mod file date
 16      4 bytes  CRC-32
 20      4 bytes  Compressed size
 24      4 bytes  Uncompressed size
 28      2 bytes  Filename length (f)
 30      2 bytes  Extra field length (e)
 32      2 bytes  File comment length (c)
 34      2 bytes  Disk number start
 36      2 bytes  Internal file attributes
 38      4 bytes  External file attributes
 42      4 bytes  Relative offset of local header
 46     (f)bytes  Filename
        (e)bytes  Extra field
        (c)bytes  File comment

*End of central directory record:*
Offset   Length   Contents
  0      4 bytes  End of central dir signature (0x06054b50)
  4      2 bytes  Number of this disk
  6      2 bytes  Number of the disk with the start of the central directory
  8      2 bytes  Total number of entries in the central dir on this disk
 10      2 bytes  Total number of entries in the central dir
 12      4 bytes  Size of the central directory
 16      4 bytes  Offset of start of central directory with respect to the starting disk number
 20      2 bytes  zipfile comment length (c)
 22     (c)bytes  zipfile comment




compression method: (2 bytes)
          0 - The file is stored (no compression)
          1 - The file is Shrunk
          2 - The file is Reduced with compression factor 1
          3 - The file is Reduced with compression factor 2
          4 - The file is Reduced with compression factor 3
          5 - The file is Reduced with compression factor 4
          6 - The file is Imploded
          7 - Reserved for Tokenizing compression algorithm
          8 - The file is Deflated

*)

(* Problem:  some of Sun's jar files seem not to have correct data size information
   in the local header (see kvem.jar in the WTK, where the local versions of csize and
   usize are zero).  The info in the central directory appears to be correct in these
   cases. Can we be sure that it'll never happen the other way round? *)
