# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2000-2006 CollabNet. All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals. For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""This module contains the DumpfileDelegate class, which writes a
Subversion dumpfile, together with the helpers it uses to read
.cvsignore contents and to normalize end-of-line markers."""


import os
import md5

from cvs2svn_lib.boolean import *
from cvs2svn_lib import config
from cvs2svn_lib.common import CommandError
from cvs2svn_lib.common import FatalError
from cvs2svn_lib.common import OP_ADD
from cvs2svn_lib.common import OP_CHANGE
from cvs2svn_lib.common import to_utf8
from cvs2svn_lib.svn_repository_mirror import SVNRepositoryMirrorDelegate


class DumpfileDelegate(SVNRepositoryMirrorDelegate):
    """Create a Subversion dumpfile."""

    def __init__(self, dumpfile_path):
        """Return a new DumpfileDelegate instance, attached to a dumpfile
        DUMPFILE_PATH.

        The file is opened for binary writing and the dumpfile header
        is written to it immediately."""

        self.dumpfile_path = dumpfile_path
        self.dumpfile = open(self.dumpfile_path, 'wb')
        self._write_dumpfile_header(self.dumpfile)

    def _write_dumpfile_header(self, dumpfile):
        """Write the dumpfile format header to the file object DUMPFILE."""

        # Initialize the dumpfile with the standard headers.
        #
        # Since the CVS repository doesn't have a UUID, and the Subversion
        # repository will be created with one anyway, we don't specify a
        # UUID in the dumpfile
        dumpfile.write('SVN-fs-dump-format-version: 2\n\n')

    def _utf8_path(self, path):
        """Return a copy of PATH encoded in UTF-8.

        Raise FatalError if converting any component of PATH raises a
        UnicodeError."""

        pieces = path.split('/')
        # Convert each path component separately (as they may each use
        # different encodings).
        for i in range(len(pieces)):
            try:
                # Log messages can be converted with the 'replace'
                # strategy, but we can't afford any lossiness here.
                pieces[i] = to_utf8(pieces[i], strict=True)
            except UnicodeError:
                raise FatalError(
                    "Unable to convert a path '%s' to internal encoding.\n"
                    "Consider rerunning with one or more '--encoding' parameters."
                    % (path,))
        return '/'.join(pieces)

    def _string_for_prop(self, name, value):
        """Return a property in the form needed for the dumpfile.

        The result is a 'K <len>' line, NAME, a 'V <len>' line and
        VALUE, each terminated by a newline."""

        return 'K %d\n%s\nV %d\n%s\n' % (len(name), name, len(value), value)

    def start_commit(self, revnum, revprops):
        """Emit the start of revision REVNUM.

        REVPROPS maps revision property names to their values; the
        properties are written sorted by name, and any property whose
        value is None is omitted."""

        self.revision = revnum

        # The start of a new commit typically looks like this:
        #
        #   Revision-number: 1
        #   Prop-content-length: 129
        #   Content-length: 129
        #
        #   K 7
        #   svn:log
        #   V 27
        #   Log message for revision 1.
        #   K 10
        #   svn:author
        #   V 7
        #   jrandom
        #   K 8
        #   svn:date
        #   V 27
        #   2003-04-22T22:57:58.132837Z
        #   PROPS-END
        #
        # Notice that the length headers count everything -- not just the
        # length of the data but also the lengths of the lengths,
        # including the 'K ' or 'V ' prefixes.
        #
        # The reason there are both Prop-content-length and Content-length
        # is that the former includes just props, while the latter
        # includes everything. That's the generic header form for any
        # entity in a dumpfile. But since revisions only have props, the
        # two lengths are always the same for revisions.

        # Calculate the output needed for the property definitions.
        prop_names = revprops.keys()
        prop_names.sort()
        prop_strings = []
        for propname in prop_names:
            if revprops[propname] is not None:
                prop_strings.append(
                    self._string_for_prop(propname, revprops[propname]))

        all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n'
        total_len = len(all_prop_strings)

        # Print the revision header and revprops
        self.dumpfile.write('Revision-number: %d\n'
                            'Prop-content-length: %d\n'
                            'Content-length: %d\n'
                            '\n'
                            % (self.revision, total_len, total_len))

        self.dumpfile.write(all_prop_strings)
        self.dumpfile.write('\n')

    def end_commit(self):
        """Do nothing; no output is written at the end of a commit."""

        pass

    def mkdir(self, path):
        """Emit the creation of directory PATH."""

        self.dumpfile.write("Node-path: %s\n"
                            "Node-kind: dir\n"
                            "Node-action: add\n"
                            "\n"
                            "\n" % self._utf8_path(path))

    def _add_or_change_path(self, s_item, op):
        """Emit the addition or change corresponding to S_ITEM.

        OP is either the constant OP_ADD or OP_CHANGE; any other value
        raises FatalError. Raise CommandError if the checkout command
        that supplies the file contents exits with a nonzero status."""

        # Validation stuffs
        if op == OP_ADD:
            action = 'add'
        elif op == OP_CHANGE:
            action = 'change'
        else:
            raise FatalError(
                "_add_or_change_path() called with bad op ('%s')" % (op,))

        # Convenience variables
        cvs_rev = s_item.cvs_rev

        # The property handling here takes advantage of an undocumented
        # but IMHO consistent feature of the Subversion dumpfile-loading
        # code. When a node's properties aren't mentioned (that is, the
        # "Prop-content-length:" header is absent, no properties are
        # listed at all, and there is no "PROPS-END\n" line) then no
        # change is made to the node's properties.
        #
        # This is consistent with the way dumpfiles behave w.r.t. text
        # content changes, so I'm comfortable relying on it. If you
        # commit a change to *just* the properties of some node that
        # already has text contents from a previous revision, then in the
        # dumpfile output for the prop change, no "Text-content-length:"
        # nor "Text-content-md5:" header will be present, and the text of
        # the file will not be given. But this does not cause the file's
        # text to be erased! It simply remains unchanged.
        #
        # This works out great for cvs2svn, due to lucky coincidences:
        #
        # For files, the only properties we ever set are set in the first
        # revision; all other revisions (including on branches) inherit
        # from that. After the first revision, we never change file
        # properties, therefore, there is no need to remember the full
        # set of properties on a given file once we've set it.
        #
        # For directories, the only property we set is "svn:ignore", and
        # while we may change it after the first revision, we always do
        # so based on the contents of a ".cvsignore" file -- in other
        # words, CVS is doing the remembering for us, so we still don't
        # have to preserve the previous value of the property ourselves.

        # Calculate the (sorted-by-name) property string and length, if
        # any.
        if s_item.svn_props_changed:
            svn_props = s_item.svn_props
            prop_contents = ''
            prop_names = svn_props.keys()
            prop_names.sort()
            for pname in prop_names:
                pvalue = svn_props[pname]
                if pvalue is not None:
                    prop_contents += self._string_for_prop(pname, pvalue)
            prop_contents += 'PROPS-END\n'
            props_header = 'Prop-content-length: %d\n' % len(prop_contents)
        else:
            prop_contents = ''
            props_header = ''

        # treat .cvsignore as a directory property
        dir_path, basename = os.path.split(cvs_rev.svn_path)
        if basename == ".cvsignore":
            ignore_vals = generate_ignores(cvs_rev)
            ignore_contents = '\n'.join(ignore_vals)
            if ignore_contents:
                ignore_contents += '\n'
            ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n'
                               % (len(ignore_contents), ignore_contents))
            ignore_contents += 'PROPS-END\n'
            ignore_len = len(ignore_contents)

            # write headers, then props
            self.dumpfile.write('Node-path: %s\n'
                                'Node-kind: dir\n'
                                'Node-action: change\n'
                                'Prop-content-length: %d\n'
                                'Content-length: %d\n'
                                '\n'
                                '%s'
                                % (self._utf8_path(dir_path), ignore_len,
                                   ignore_len, ignore_contents))

        # If the file has keywords, we must prevent CVS/RCS from expanding
        # the keywords because they must be unexpanded in the repository,
        # or Subversion will get confused.
        pipe_cmd, pipe = cvs_rev.cvs_file.project.cvs_repository.get_co_pipe(
            cvs_rev, suppress_keyword_substitution=s_item.has_keywords)

        self.dumpfile.write('Node-path: %s\n'
                            'Node-kind: file\n'
                            'Node-action: %s\n'
                            '%s'  # no property header if no props
                            'Text-content-length: '
                            % (self._utf8_path(cvs_rev.svn_path),
                               action, props_header))

        # Remember where the placeholder headers start; the real values
        # are only known after the contents have been streamed, so they
        # are patched in afterwards.
        pos = self.dumpfile.tell()

        self.dumpfile.write('0000000000000000\n'
                            'Text-content-md5: 00000000000000000000000000000000\n'
                            'Content-length: 0000000000000000\n'
                            '\n')

        if prop_contents:
            self.dumpfile.write(prop_contents)

        # Insert a filter to convert all EOLs to LFs if necessary
        if s_item.needs_eol_filter:
            data_reader = LF_EOL_Filter(pipe.stdout)
        else:
            data_reader = pipe.stdout

        # Insert the rev contents, calculating length and checksum as we
        # go.
        checksum = md5.new()
        length = 0
        while True:
            buf = data_reader.read(config.PIPE_READ_SIZE)
            if buf == '':
                break
            checksum.update(buf)
            length += len(buf)
            self.dumpfile.write(buf)

        _finish_co_pipe(pipe_cmd, pipe)

        # Go back to patch up the length and checksum headers:
        self.dumpfile.seek(pos, 0)
        # We left 16 zeros for the text length; replace them with the real
        # length, padded on the left with spaces:
        self.dumpfile.write('%16d' % length)
        # 16... + 1 newline + len('Text-content-md5: ') == 35
        self.dumpfile.seek(pos + 35, 0)
        self.dumpfile.write(checksum.hexdigest())
        # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ')
        # == 84
        self.dumpfile.seek(pos + 84, 0)
        # The content length is the length of property data, text data,
        # and any metadata around/inside around them.
        self.dumpfile.write('%16d' % (length + len(prop_contents)))
        # Jump back to the end of the stream
        self.dumpfile.seek(0, 2)

        # This record is done (write two newlines -- one to terminate
        # contents that weren't themselves newline-terminated, one to
        # provide a blank line for readability).
        self.dumpfile.write('\n\n')

    def add_path(self, s_item):
        """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""

        self._add_or_change_path(s_item, OP_ADD)

    def change_path(self, s_item):
        """Emit the change corresponding to S_ITEM, an SVNCommitItem."""

        self._add_or_change_path(s_item, OP_CHANGE)

    def delete_path(self, path):
        """Emit the deletion of PATH."""

        self.dumpfile.write('Node-path: %s\n'
                            'Node-action: delete\n'
                            '\n' % self._utf8_path(path))

    def copy_path(self, src_path, dest_path, src_revnum):
        """Emit the copying of SRC_PATH at SRC_REVNUM to DEST_PATH."""

        # We don't need to include "Node-kind:" for copies; the loader
        # ignores it anyway and just uses the source kind instead.
        self.dumpfile.write('Node-path: %s\n'
                            'Node-action: add\n'
                            'Node-copyfrom-rev: %d\n'
                            'Node-copyfrom-path: /%s\n'
                            '\n'
                            % (self._utf8_path(dest_path),
                               src_revnum,
                               self._utf8_path(src_path)))

    def finish(self):
        """Perform any cleanup necessary after all revisions have been
        committed.

        This closes the dumpfile."""

        self.dumpfile.close()


def _finish_co_pipe(pipe_cmd, pipe):
    """Close PIPE's stdout and wait for the command to terminate.

    PIPE_CMD and PIPE are the pair returned by get_co_pipe(). Raise
    CommandError, carrying whatever the command wrote to stderr, if it
    exits with a nonzero status."""

    pipe.stdout.close()
    error_output = pipe.stderr.read()
    exit_status = pipe.wait()
    if exit_status:
        raise CommandError(pipe_cmd, exit_status, error_output)


def generate_ignores(cvs_rev):
    """Return the list of ignore patterns in the file revision CVS_REV.

    The contents of the revision are read from its checkout pipe and
    split on whitespace. A pattern consisting of a lone '!' discards
    all of the patterns collected before it. Raise CommandError if the
    checkout command exits with a nonzero status."""

    # Read in props
    pipe_cmd, pipe = \
        cvs_rev.cvs_file.project.cvs_repository.get_co_pipe(cvs_rev)
    chunks = []
    while True:
        buf = pipe.stdout.read(config.PIPE_READ_SIZE)
        if not buf:
            break
        chunks.append(buf)
    _finish_co_pipe(pipe_cmd, pipe)

    # Tweak props: patterns may be separated by any whitespace.
    # split() never yields empty strings, so blank lines need no
    # special handling.
    ignore_vals = []
    for ignore in ''.join(chunks).split():
        # Reset the list if we encounter a '!'
        # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
        if ignore == '!':
            ignore_vals = []
            continue
        ignore_vals.append(ignore)
    return ignore_vals


class LF_EOL_Filter:
    """Filter a stream and convert all end-of-line markers (CRLF, CR or
    LF) into LFs only."""

    def __init__(self, stream):
        """Wrap STREAM, an object with a read(size) method."""

        self.stream = stream
        # True when the previous chunk ended in a CR that has been held
        # back until we can see whether an LF follows it.
        self.carry_cr = False
        # True once the underlying stream has returned an empty read.
        self.eof = False

    def read(self, size):
        """Return the next chunk of the stream with its EOLs converted.

        Up to SIZE characters are read from the underlying stream at a
        time. A trailing CR is held back until the next read, so that a
        CRLF split across two reads still becomes a single LF; because
        of the held-back CR, the result may be one character longer
        than SIZE. An empty string is returned only at end of stream."""

        while True:
            buf = self.stream.read(size)
            self.eof = len(buf) == 0
            if self.carry_cr:
                buf = '\r' + buf
                self.carry_cr = False
            if not self.eof and buf[-1] == '\r':
                self.carry_cr = True
                buf = buf[:-1]
            buf = buf.replace('\r\n', '\n')
            buf = buf.replace('\r', '\n')
            # If the whole chunk was a held-back CR, there is nothing to
            # return yet; read again rather than signalling a false EOF.
            if buf or self.eof:
                return buf