#!/usr/bin/env python -u
# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2009-2010 CollabNet.  All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.  The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals.  For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""Generate git blobs directly from RCS files.

Usage: generate_blobs.py BLOBFILE

To standard input should be written a series of pickles, each of which
contains the following tuple:

(RCSFILE, {CVS_REV : MARK, ...})

indicating which RCS file to read, which CVS revisions should be
written to the blob file, and which marks to give each of the blobs.

Since the tuples are read from stdin, either the calling program has
to write to this program's stdin in binary mode and ensure that this
program's standard input is opened in binary mode (e.g., using
Python's '-u' option) or both can be in text mode *provided* that
pickle protocol 0 is used.

The program does most of its work in RAM, keeping at most one revision
fulltext and one revision deltatext (plus perhaps one or two copies as
scratch space) in memory at a time.  But there are times when the
fulltext of a revision is needed multiple times, for example when
multiple branches sprout from the revision.  In these cases, the
fulltext is written to disk.  If the fulltext is also needed for the
blobfile, then the copy in the blobfils is read again when it is
needed.  If the fulltext is not needed in the blobfile, then it is
written to a temporary file created with Python's tempfile module."""

import sys
import os
import tempfile
import cPickle as pickle

sys.path.insert(0, os.path.dirname(os.path.dirname(sys.argv[0])))

from cvs2svn_lib.rcsparser import Sink
from cvs2svn_lib.rcsparser import parse
from cvs2svn_lib.rcs_stream import RCSStream


def read_marks():
  # A map from CVS revision number (e.g., 1.2.3.4) to mark:
  marks = {}
  for l in sys.stdin:
    [rev, mark] = l.strip().split()
    marks[rev] = mark

  return marks


class RevRecord(object):
  def __init__(self, rev, mark=None):
    self.rev = rev
    self.mark = mark

    # The rev whose fulltext is the base for this one's delta.
    self.base = None

    # Other revs that refer to this one as their base text:
    self.refs = set()

    # The (f, offset, length) where the fulltext of this revision can
    # be found:
    self.fulltext = None

  def is_needed(self):
    return bool(self.mark is not None or self.refs)

  def is_written(self):
    return self.fulltext is not None

  def write_blob(self, f, text):
    f.seek(0, 2)
    length = len(text)
    f.write('blob\n')
    f.write('mark :%s\n' % (self.mark,))
    f.write('data %d\n' % (length,))
    offset = f.tell()
    f.write(text)
    f.write('\n')

    self.fulltext = (f, offset, length)

    # This record (with its mark) has now been written, so the mark is
    # no longer needed.  Setting it to None might allow is_needed() to
    # become False:
    self.mark = None

  def write(self, f, text):
    f.seek(0, 2)
    offset = f.tell()
    length = len(text)
    f.write(text)
    self.fulltext = (f, offset, length)

  def read_fulltext(self):
    assert self.fulltext is not None
    (f, offset, length) = self.fulltext
    f.seek(offset)
    return f.read(length)

  def __str__(self):
    if self.mark is not None:
      return '%s (%r): %r, %s' % (
          self.rev, self.mark, self.refs, self.fulltext is not None,
          )
    else:
      return '%s: %r, %s' % (self.rev, self.refs, self.fulltext is not None)


class WriteBlobSink(Sink):
  def __init__(self, blobfile, marks):
    self.blobfile = blobfile

    # A map {rev : RevRecord} for all of the revisions whose fulltext
    # will still be needed:
    self.revrecs = {}

    # The revisions that need marks will definitely be needed, so
    # create records for them now (the rest will be filled in while
    # reading the RCS file):
    for (rev, mark) in marks.items():
      self.revrecs[rev] = RevRecord(rev, mark)

    # The RevRecord of the last fulltext that has been reconstructed,
    # if it still is_needed():
    self.last_revrec = None
    # An RCSStream holding the fulltext of last_revrec:
    self.last_rcsstream = None

    # A file to temporarily hold the fulltexts of revisions for which
    # no blobs are needed:
    self.fulltext_file = tempfile.TemporaryFile()

  def __getitem__(self, rev):
    try:
      return self.revrecs[rev]
    except KeyError:
      revrec = RevRecord(rev)
      self.revrecs[rev] = revrec
      return revrec

  def define_revision(self, rev, timestamp, author, state, branches, next):
    revrec = self[rev]

    if next is not None:
      revrec.refs.add(next)

    revrec.refs.update(branches)

    for dependent_rev in revrec.refs:
      dependent_revrec = self[dependent_rev]
      assert dependent_revrec.base is None
      dependent_revrec.base = rev

  def tree_completed(self):
    """Remove unneeded RevRecords.

    Remove the RevRecords for any revisions whose fulltext will not be
    needed (neither as blob output nor as the base of another needed
    revision)."""

    revrecs_to_remove = [
        revrec
        for revrec in self.revrecs.itervalues()
        if not revrec.is_needed()
        ]
    while revrecs_to_remove:
      revrec = revrecs_to_remove.pop()
      del self.revrecs[revrec.rev]
      base_revrec = self[revrec.base]
      base_revrec.refs.remove(revrec.rev)
      if not base_revrec.is_needed():
        revrecs_to_remove.append(base_revrec)

  def set_revision_info(self, rev, log, text):
    revrec = self.revrecs.get(rev)

    if revrec is None:
      return

    base_rev = revrec.base
    if base_rev is None:
      # This must be the last revision on trunk, for which the
      # fulltext is stored directly in the RCS file:
      assert self.last_revrec is None
      if revrec.mark is not None:
        revrec.write_blob(self.blobfile, text)
      if revrec.is_needed():
        self.last_revrec = revrec
        self.last_rcsstream = RCSStream(text)
    elif self.last_revrec is not None and base_rev == self.last_revrec.rev:
      # Our base revision is stored in self.last_rcsstream.
      self.last_revrec.refs.remove(rev)
      if self.last_revrec.is_needed():
        if not self.last_revrec.is_written():
          self.last_revrec.write(
              self.fulltext_file, self.last_rcsstream.get_text()
              )
      self.last_rcsstream.apply_diff(text)
      if revrec.mark is not None:
        revrec.write_blob(self.blobfile, self.last_rcsstream.get_text())
      if revrec.is_needed():
        self.last_revrec = revrec
      else:
        self.last_revrec = None
        self.last_rcsstream = None
    else:
      # Our base revision is not stored in self.last_rcsstream; it
      # will have to be obtained from elsewhere.

      # Store the old last_rcsstream if necessary:
      if self.last_revrec is not None:
        if not self.last_revrec.is_written():
          self.last_revrec.write(
              self.fulltext_file, self.last_rcsstream.get_text()
              )
        self.last_revrec = None
        self.last_rcsstream = None

      base_revrec = self[base_rev]
      rcsstream = RCSStream(base_revrec.read_fulltext())
      base_revrec.refs.remove(rev)
      rcsstream.apply_diff(text)
      if revrec.mark is not None:
        revrec.write_blob(self.blobfile, rcsstream.get_text())
      if revrec.is_needed():
        self.last_revrec = revrec
        self.last_rcsstream = rcsstream
      del rcsstream

  def parse_completed(self):
    self.fulltext_file.close()


def main(args):
  [blobfilename] = args
  blobfile = open(blobfilename, 'w+b')
  while True:
    try:
      (rcsfile, marks) = pickle.load(sys.stdin)
    except EOFError:
      break
    parse(open(rcsfile, 'rb'), WriteBlobSink(blobfile, marks))

  blobfile.close()


if __name__ == '__main__':
  main(sys.argv[1:])