# (Be in -*- python -*- mode.) # # ==================================================================== # Copyright (c) 2000-2006 CollabNet. All rights reserved. # # This software is licensed as described in the file COPYING, which # you should have received as part of this distribution. The terms # are also available at http://subversion.tigris.org/license-1.html. # If newer versions of this license are posted there, you may use a # newer version instead, at your option. # # This software consists of voluntary contributions made by many # individuals. For exact contribution history, see the revision # history and logs, available at http://cvs2svn.tigris.org/. # ==================================================================== """This module contains database facilities used by cvs2svn.""" from __future__ import generators import sys import os import re import time from cvs2svn_lib.boolean import * from cvs2svn_lib.set_support import * from cvs2svn_lib import config from cvs2svn_lib.common import FatalError from cvs2svn_lib.common import warning_prefix from cvs2svn_lib.common import error_prefix from cvs2svn_lib.common import OP_ADD from cvs2svn_lib.common import OP_CHANGE from cvs2svn_lib.common import OP_DELETE from cvs2svn_lib.log import Log from cvs2svn_lib.context import Ctx from cvs2svn_lib.artifact_manager import artifact_manager from cvs2svn_lib.cvs_file import CVSFile from cvs2svn_lib.line_of_development import Trunk from cvs2svn_lib.line_of_development import Branch from cvs2svn_lib.cvs_item import CVSRevision from cvs2svn_lib.key_generator import KeyGenerator from cvs2svn_lib.database import Database from cvs2svn_lib.database import SDatabase from cvs2svn_lib.database import DB_OPEN_NEW from cvs2svn_lib.cvs_file_database import CVSFileDatabase from cvs2svn_lib.cvs_item_database import NewCVSItemStore from cvs2svn_lib.symbol import Symbol from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector from cvs2svn_lib.metadata_database import MetadataDatabase import cvs2svn_rcsparse branch_tag_re = re.compile(r''' ^ ((?:\d+\.\d+\.)+) # A nonzero even number of digit groups w/trailing dot (?:0\.)? # CVS sticks an extra 0 here; RCS does not (\d+) # And the last digit group $ ''', re.VERBOSE) # This really only matches standard '1.1.1.*'-style vendor revisions. # One could conceivably have a file whose default branch is 1.1.3 or # whatever, or was that at some point in time, with vendor revisions # 1.1.3.1, 1.1.3.2, etc. But with the default branch gone now (which # is the only time this regexp gets used), we'd have no basis for # assuming that the non-standard vendor branch had ever been the # default branch anyway, so we don't want this to match them anyway. vendor_revision = re.compile(r'^1\.1\.1\.\d+$') def is_trunk_revision(rev): """Return True iff REV is a trunk revision.""" return rev.count('.') == 1 def is_branch_revision(rev): """Return True iff REV is a branch revision.""" return rev.count('.') >= 3 def is_same_line_of_development(rev1, rev2): """Return True if rev1 and rev2 are on the same line of development (i.e., both on trunk, or both on the same branch); return False otherwise. Either rev1 or rev2 can be None, in which case automatically return False.""" if rev1 is None or rev2 is None: return False if rev1.count('.') == 1 and rev2.count('.') == 1: return True if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]: return True return False class _RevisionData: """We track the state of each revision so that in set_revision_info, we can determine if our op is an add/change/delete. We can do this because in set_revision_info, we'll have all of the _RevisionData for a file at our fingertips, and we need to examine the state of our prev_rev to determine if we're an add or a change. Without the state of the prev_rev, we are unable to distinguish between an add and a change.""" def __init__(self, cvs_rev_id, rev, timestamp, author, state): # The id of this revision: self.cvs_rev_id = cvs_rev_id # The CVSRevision is not yet known. It will be stored here: self.cvs_rev = None self.rev = rev self.timestamp = timestamp self.author = author self.original_timestamp = timestamp self._adjusted = False self.state = state # If this is the first revision on a branch, then this is the # branch_data of that branch; otherwise it is None. self.parent_branch_data = None # The revision number of the parent of this revision along the # same line of development, if any. # # For the first revision R on a branch, we consider the revision # from which R sprouted to be the 'previous'. # # Note that this revision can't be determined arithmetically (due # to cvsadmin -o, which is why this is necessary). # # If the key has no previous revision, then this field is None. self.parent = None # The revision number of the primary child of this revision (the # child along the same line of development), if any; otherwise, # None. self.child = None # The _BranchData instances of branches that sprout from this # revision. It would be inconvenient to initialize it here # because we would have to scan through all branches known by the # _SymbolDataCollector to find the ones having us as the parent. # Instead, this information is filled in by # _FileDataCollector._resolve_dependencies(). self.branches_data = [] # The _SymbolData instances of symbols that are closed by this # revision. self.closed_symbols_data = [] # The _TagData instances of tags that are connected to this # revision. self.tags_data = [] # The id of the metadata record associated with this revision. self.metadata_id = None # A boolean value indicating whether deltatext was associated with # this revision. self.deltatext_exists = None def adjust_timestamp(self, timestamp): self._adjusted = True self.timestamp = timestamp def timestamp_was_adjusted(self): return self._adjusted def is_first_on_branch(self): return not self.parent or self.parent_branch_data is not None class _SymbolData: """Collection area for information about a CVS symbol (branch or tag).""" def __init__(self, id, symbol): self.id = id self.symbol = symbol class _BranchData(_SymbolData): """Collection area for information about a CVSBranch.""" def __init__(self, id, symbol, branch_number): _SymbolData.__init__(self, id, symbol) self.branch_number = branch_number # The revision number of the revision from which this branch # sprouts. self.parent = self.branch_number[:self.branch_number.rindex(".")] # The revision number of the first commit on this branch, if any; # otherwise, None. self.child = None class _TagData(_SymbolData): """Collection area for information about a CVSTag.""" def __init__(self, id, symbol, rev): _SymbolData.__init__(self, id, symbol) self.rev = rev class _SymbolDataCollector: """Collect information about symbols in a CVSFile.""" def __init__(self, fdc, cvs_file): self.fdc = fdc self.cvs_file = cvs_file self.pdc = self.fdc.pdc self.collect_data = self.fdc.collect_data # A set containing the names of each known symbol in this file, # used to check for duplicates. self._known_symbols = set() # Map { branch_number : _BranchData }, where branch_number has an # odd number of digits. self.branches_data = { } # Map { revision : [ tag_data ] }, where revision has an even # number of digits, and the value is a list of _TagData objects # for tags that apply to that revision. self.tags_data = { } def _add_branch(self, name, branch_number): """Record that BRANCH_NUMBER is the branch number for branch NAME, and derive and record the revision from which NAME sprouts. BRANCH_NUMBER is an RCS branch number with an odd number of components, for example '1.7.2' (never '1.7.0.2'). Return the _BranchData instance (which is usually newly-created).""" branch_data = self.branches_data.get(branch_number) if branch_data is not None: sys.stderr.write("%s: in '%s':\n" " branch '%s' already has name '%s',\n" " cannot also have name '%s', ignoring the latter\n" % (warning_prefix, self.cvs_file.filename, branch_number, branch_data.symbol.name, name)) return branch_data symbol = self.pdc.get_symbol(name) self.collect_data.symbol_stats.register_branch_creation(symbol) branch_data = _BranchData( self.collect_data.key_generator.gen_id(), symbol, branch_number) self.branches_data[branch_number] = branch_data return branch_data def _add_unlabeled_branch(self, branch_number): name = "unlabeled-" + branch_number return self._add_branch(name, branch_number) def _add_tag(self, name, revision): """Record that tag NAME refers to the specified REVISION.""" symbol = self.pdc.get_symbol(name) self.collect_data.symbol_stats.register_tag_creation(symbol) tag_data = _TagData( self.collect_data.key_generator.gen_id(), symbol, revision) self.tags_data.setdefault(revision, []).append(tag_data) return tag_data def define_symbol(self, name, revision): """Record a symbol called NAME, which is associated with REVISON. REVISION is an unprocessed revision number from the RCS file's header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'. NAME is an untransformed branch or tag name. This function will determine by inspection whether it is a branch or a tag, and record it in the right places.""" name = self.cvs_file.project.transform_symbol(self.cvs_file, name) # Check that the symbol is not already defined, which can easily # happen when --symbol-transform is used: if name in self._known_symbols: err = "%s: Multiple definitions of the symbol '%s' in '%s'" \ % (error_prefix, name, self.cvs_file.filename) sys.stderr.write(err + "\n") self.collect_data.fatal_errors.append(err) return self._known_symbols.add(name) # Determine whether it is a branch or tag, then add it: m = branch_tag_re.match(revision) if m: self._add_branch(name, m.group(1) + m.group(2)) else: self._add_tag(name, revision) def rev_to_branch_data(self, revision): """Return the branch_data of the branch on which REVISION lies. REVISION is a branch revision number with an even number of components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2'). For the convenience of callers, REVISION can also be a trunk revision such as '1.2', in which case just return None.""" if is_trunk_revision(revision): return None return self.branches_data.get(revision[:revision.rindex(".")]) def register_commit(self, rev_data): """If REV_DATA describes a non-trunk revision number, then record it as a commit on the corresponding branch. This records the commit in symbol_stats, which is used to generate statistics for --force-branch and --force-tag guidance.""" rev = rev_data.rev if is_branch_revision(rev): branch_number = rev[:rev.rindex(".")] branch_data = self.branches_data[branch_number] # Register the commit on this non-trunk branch self.collect_data.symbol_stats.register_branch_commit( branch_data.symbol) def register_branch_blockers(self): for (revision, tag_data_list) in self.tags_data.items(): if is_branch_revision(revision): branch_data_parent = self.rev_to_branch_data(revision) for tag_data in tag_data_list: self.collect_data.symbol_stats.register_branch_blocker( branch_data_parent.symbol, tag_data.symbol) for branch_data_child in self.branches_data.values(): if is_branch_revision(branch_data_child.parent): branch_data_parent = self.rev_to_branch_data(branch_data_child.parent) self.collect_data.symbol_stats.register_branch_blocker( branch_data_parent.symbol, branch_data_child.symbol) class _FileDataCollector(cvs2svn_rcsparse.Sink): """Class responsible for collecting RCS data for a particular file. Any collected data that need to be remembered are stored into the referenced CollectData instance.""" def __init__(self, pdc, cvs_file): """Create an object that is prepared to receive data for CVS_FILE. CVS_FILE is a CVSFile instance. COLLECT_DATA is used to store the information collected about the file.""" self.pdc = pdc self.cvs_file = cvs_file self.collect_data = self.pdc.collect_data self.project = self.cvs_file.project # A place to store information about the symbols in this file: self.sdc = _SymbolDataCollector(self, self.cvs_file) # { revision : _RevisionData instance } self._rev_data = { } # A list [ revision ] of the revision numbers seen, in the order # they were given to us by rcsparse: self._rev_order = [] # Lists [ (parent, child) ] of revision number pairs indicating # that revision child depends on revision parent along the main # line of development. self._primary_dependencies = [] # If set, this is an RCS branch number -- rcsparse calls this the # "principal branch", but CVS and RCS refer to it as the "default # branch", so that's what we call it, even though the rcsparse API # setter method is still 'set_principal_branch'. self.default_branch = None # The default RCS branch, if any, for this CVS file. # # The value is None or a vendor branch revision, such as # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'. The vendor branch # revision represents the highest vendor branch revision thought # to have ever been head of the default branch. # # The reason we record a specific vendor revision, rather than a # default branch number, is that there are two cases to handle: # # One case is simple. The RCS file lists a default branch # explicitly in its header, such as '1.1.1'. In this case, we # know that every revision on the vendor branch is to be treated # as head of trunk at that point in time. # # But there's also a degenerate case. The RCS file does not # currently have a default branch, yet we can deduce that for some # period in the past it probably *did* have one. For example, the # file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are # dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated # after 1.2. In this case, we should record 1.1.1.96 as the last # vendor revision to have been the head of the default branch. self.cvs_file_default_branch = None # If the RCS file doesn't have a default branch anymore, but does # have vendor revisions, then we make an educated guess that those # revisions *were* the head of the default branch up until the # commit of 1.2, at which point the file's default branch became # trunk. This records the date at which 1.2 was committed. self.first_non_vendor_revision_date = None # A list of rev_data for each revision, in the order that the # corresponding set_revision_info() callback was called. This # information is collected while the file is being parsed then # processed in _process_revision_data(), which is called by # parse_completed(). self._revision_data = [] def _get_rev_id(self, revision): if revision is None: return None return self._rev_data[revision].cvs_rev_id def set_principal_branch(self, branch): """This is a callback method declared in Sink.""" self.default_branch = branch def set_expansion(self, mode): """This is a callback method declared in Sink.""" self.cvs_file.mode = mode def define_tag(self, name, revision): """Remember the symbol name and revision, but don't process them yet. This is a callback method declared in Sink.""" self.sdc.define_symbol(name, revision) def admin_completed(self): """This is a callback method declared in Sink.""" pass def define_revision(self, revision, timestamp, author, state, branches, next): """This is a callback method declared in Sink.""" for branch in branches: branch_number = branch[:branch.rindex('.')] branch_data = self.sdc.branches_data.get(branch_number) if branch_data is None: # Normally we learn about the branches from the branch names # and numbers parsed from the symbolic name header. But this # must have been an unlabeled branch that slipped through the # net. Generate a name for it and create a _BranchData record # for it now. branch_data = self.sdc._add_unlabeled_branch(branch_number) assert branch_data.child is None branch_data.child = branch # Record basic information about the revision: self._rev_data[revision] = _RevisionData( self.collect_data.key_generator.gen_id(), revision, int(timestamp), author, state) # Remember the order that revisions were defined: self._rev_order.append(revision) # When on trunk, the RCS 'next' revision number points to what # humans might consider to be the 'previous' revision number. For # example, 1.3's RCS 'next' is 1.2. # # However, on a branch, the RCS 'next' revision number really does # point to what humans would consider to be the 'next' revision # number. For example, 1.1.2.1's RCS 'next' would be 1.1.2.2. # # In other words, in RCS, 'next' always means "where to find the next # deltatext that you need this revision to retrieve. # # That said, we don't *want* RCS's behavior here, so we determine # whether we're on trunk or a branch and set the dependencies # accordingly. if next: if is_trunk_revision(revision): self._primary_dependencies.append( (next, revision,) ) else: self._primary_dependencies.append( (revision, next,) ) def _resolve_dependencies(self): """Store the primary and branch dependencies into the rev_data objects.""" for (parent, child,) in self._primary_dependencies: parent_data = self._rev_data[parent] assert parent_data.child is None parent_data.child = child child_data = self._rev_data[child] assert child_data.parent is None child_data.parent = parent for branch_data in self.sdc.branches_data.values(): # The branch_data's parent has the branch as a child regardless # of whether the branch had any subsequent commits: parent_data = self._rev_data[branch_data.parent] parent_data.branches_data.append(branch_data) if not Ctx().trunk_only and parent_data.child is not None: closing_data = self._rev_data[parent_data.child] closing_data.closed_symbols_data.append(branch_data) # If the branch has a child (i.e., something was committed on # the branch), then we store a reference to the branch_data # there, and also define the child's parent to be the branch's # parent: if branch_data.child is not None: child_data = self._rev_data[branch_data.child] assert child_data.parent_branch_data is None child_data.parent_branch_data = branch_data assert child_data.parent is None child_data.parent = branch_data.parent for tag_data_list in self.sdc.tags_data.values(): for tag_data in tag_data_list: # The tag_data's rev has the tag as a child: parent_data = self._rev_data[tag_data.rev] parent_data.tags_data.append(tag_data) if not Ctx().trunk_only and parent_data.child is not None: closing_data = self._rev_data[parent_data.child] closing_data.closed_symbols_data.append(tag_data) def _update_default_branch(self, rev_data): """Ratchet up the highest vendor head revision based on REV_DATA, if necessary.""" if self.default_branch: default_branch_root = self.default_branch + "." if (rev_data.rev.startswith(default_branch_root) and default_branch_root.count('.') == rev_data.rev.count('.')): # This revision is on the default branch, so record that it is # the new highest default branch head revision. self.cvs_file_default_branch = rev_data.rev else: # No default branch, so make an educated guess. if rev_data.rev == '1.2': # This is probably the time when the file stopped having a # default branch, so make a note of it. self.first_non_vendor_revision_date = rev_data.timestamp else: if vendor_revision.match(rev_data.rev) \ and (not self.first_non_vendor_revision_date or rev_data.timestamp < self.first_non_vendor_revision_date): # We're looking at a vendor revision, and it wasn't # committed after this file lost its default branch, so bump # the maximum trunk vendor revision in the permanent record. self.cvs_file_default_branch = rev_data.rev def _resync_chain(self, rev_data): """If the REV_DATA.parent revision exists and it occurred later than the REV_DATA revision, then shove the previous revision back in time (and any before it that may need to shift). Return True iff any resyncing was done. We sync backwards and not forwards because any given CVS Revision has only one previous revision. However, a CVS Revision can *be* a previous revision for many other revisions (e.g., a revision that is the source of multiple branches). This becomes relevant when we do the secondary synchronization in pass 2--we can make certain that we don't resync a revision earlier than its previous revision, but it would be non-trivial to make sure that we don't resync revision R *after* any revisions that have R as a previous revision.""" resynced = False while rev_data.parent is not None: prev_rev_data = self._rev_data[rev_data.parent] if prev_rev_data.timestamp < rev_data.timestamp: # No resyncing needed here. return resynced old_timestamp = prev_rev_data.timestamp prev_rev_data.adjust_timestamp(rev_data.timestamp - 1) resynced = True delta = prev_rev_data.timestamp - old_timestamp Log().verbose( "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds" % (self.cvs_file.cvs_path, prev_rev_data.rev, time.ctime(old_timestamp), delta)) if abs(delta) > config.COMMIT_THRESHOLD: Log().warn( "%s: Significant timestamp change for '%s' (%d seconds)" % (warning_prefix, self.cvs_file.cvs_path, delta)) rev_data = prev_rev_data return resynced def tree_completed(self): """The revision tree has been parsed. Analyze it for consistency. This is a callback method declared in Sink.""" for rev in self._rev_order: rev_data = self._rev_data[rev] self.sdc.register_commit(rev_data) self._update_default_branch(rev_data) self._resolve_dependencies() # Our algorithm depends upon the timestamps on the revisions occuring # monotonically over time. That is, we want to see rev 1.34 occur in # time before rev 1.35. If we inserted 1.35 *first* (due to the time- # sorting), and then tried to insert 1.34, we'd be screwed. # To perform the analysis, we'll simply visit all of the 'previous' # links that we have recorded and validate that the timestamp on the # previous revision is before the specified revision. # If we have to resync some nodes, then we restart the scan. Just # keep looping as long as we need to restart. while True: for rev_data in self._rev_data.values(): if self._resync_chain(rev_data): # Abort for loop, causing the scan to start again: break else: # Finished the for-loop without having to resync anything. # We're done. return def _determine_operation(self, rev_data): # How to tell if a CVSRevision is an add, a change, or a deletion: # # It's a delete if RCS state is 'dead' # # It's an add if RCS state is 'Exp.' and # - we either have no previous revision # or # - we have a previous revision whose state is 'dead' # # Anything else is a change. prev_rev_data = self._rev_data.get(rev_data.parent) if rev_data.state == 'dead': op = OP_DELETE elif prev_rev_data is None or prev_rev_data.state == 'dead': op = OP_ADD else: op = OP_CHANGE # There can be an odd situation where the tip revision of a branch # is alive, but every predecessor on the branch is in state 'dead', # yet the revision from which the branch sprouts is alive. (This # is sort of a mirror image of the more common case of adding a # file on a branch, in which the first revision on the branch is # alive while the revision from which it sprouts is dead.) # # In this odd situation, we must mark the first live revision on # the branch as an OP_CHANGE instead of an OP_ADD, because it # reflects, however indirectly, a change w.r.t. the source # revision from which the branch sprouts. # # This is issue #89. if is_branch_revision(rev_data.rev) and rev_data.state != 'dead': cur_rev_data = rev_data while True: if cur_rev_data.parent is None: break prev_rev_data = self._rev_data[cur_rev_data.parent] if (not is_same_line_of_development(cur_rev_data.rev, prev_rev_data.rev) and cur_rev_data.state == 'dead' and prev_rev_data.state != 'dead'): op = OP_CHANGE cur_rev_data = prev_rev_data return op def set_revision_info(self, revision, log, text): """This is a callback method declared in Sink.""" rev_data = self._rev_data[revision] rev_data.metadata_id = self.collect_data.metadata_db.get_key( self.project, rev_data.author, log) rev_data.deltatext_exists = bool(text) # "...Give back one kadam to honor the Hebrew God whose Ark this is." # -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark' # # If revision 1.1 appears to have been created via 'cvs add' # instead of 'cvs import', then this file probably never had a # default branch, so retroactively remove its record in the # default branches db. The test is that the log message CVS uses # for 1.1 in imports is "Initial revision\n" with no period. if revision == '1.1' and log != 'Initial revision\n': self.cvs_file_default_branch = None self._revision_data.append(rev_data) def _is_default_branch_revision(self, rev_data): """Return True iff REV_DATA.rev is a default branch revision.""" val = self.cvs_file_default_branch if val is not None: val_last_dot = val.rindex(".") our_last_dot = rev_data.rev.rindex(".") default_branch = val[:val_last_dot] our_branch = rev_data.rev[:our_last_dot] default_rev_component = int(val[val_last_dot + 1:]) our_rev_component = int(rev_data.rev[our_last_dot + 1:]) if (default_branch == our_branch and our_rev_component <= default_rev_component): return True return False def _process_revision_data(self, rev_data): if rev_data.timestamp_was_adjusted(): # the timestamp on this revision was changed. log it for later # resynchronization of other files's revisions that occurred # for this time and log message. self.collect_data.resync.write( '%08lx %x %08lx\n' % (rev_data.original_timestamp, rev_data.metadata_id, rev_data.timestamp)) if is_branch_revision(rev_data.rev): branch_data = self.sdc.rev_to_branch_data(rev_data.rev) lod = Branch(branch_data.symbol) else: lod = Trunk() branch_ids = [ branch_data.symbol.id for branch_data in rev_data.branches_data ] tag_ids = [ tag_data.symbol.id for tag_data in rev_data.tags_data ] closed_symbol_ids = [ closed_symbol_data.symbol.id for closed_symbol_data in rev_data.closed_symbols_data ] cvs_rev = CVSRevision( self._get_rev_id(rev_data.rev), self.cvs_file, rev_data.timestamp, rev_data.metadata_id, self._get_rev_id(rev_data.parent), self._get_rev_id(rev_data.child), self._determine_operation(rev_data), rev_data.rev, rev_data.deltatext_exists, lod, rev_data.is_first_on_branch(), self._is_default_branch_revision(rev_data), tag_ids, branch_ids, closed_symbol_ids) rev_data.cvs_rev = cvs_rev self.collect_data.add_cvs_item(cvs_rev) def parse_completed(self): """Finish the processing of this file. - Create CVSRevisions for all rev_data seen. - Walk through all branches and tags and register them with their parent branch in the symbol database. This is a callback method declared in Sink.""" for rev_data in self._revision_data: self._process_revision_data(rev_data) self.collect_data.add_cvs_file(self.cvs_file) self.sdc.register_branch_blockers() # Break a circular linkage, allowing self and sdc to be freed. del self.sdc ctrl_characters_regexp = re.compile('[\\\x00-\\\x1f\\\x7f]') def verify_filename_legal(filename): """Verify that FILENAME does not include any control characters. If it does, raise a FatalError.""" m = ctrl_characters_regexp.search(filename) if m: raise FatalError( "Character %r in filename %r is not supported by Subversion." % (m.group(), filename,)) class _ProjectDataCollector: def __init__(self, collect_data, project): self.collect_data = collect_data self.project = project self.found_valid_file = False self.fatal_errors = [] self.num_files = 0 # A map { name -> Symbol } for all known symbols in this project. self.symbols = {} os.path.walk(self.project.project_cvs_repos_path, _ProjectDataCollector._visit_directory, self) if not self.fatal_errors and not self.found_valid_file: self.fatal_errors.append( '\n' 'No RCS files found under %r!\n' 'Are you absolutely certain you are pointing cvs2svn\n' 'at a CVS repository?\n' % self.project.project_cvs_repos_path) def get_symbol(self, name): """Return the Symbol object for the symbol named NAME in this project. If such a symbol does not yet exist, allocate a new symbol_id, create a Symbol instance, store it in self.symbols, and return it.""" symbol = self.symbols.get(name) if symbol is None: symbol = Symbol( self.collect_data.symbol_key_generator.gen_id(), self.project, name) self.symbols[name] = symbol return symbol def _process_file(self, pathname): fdc = _FileDataCollector(self, self.project.get_cvs_file(pathname)) if not fdc.cvs_file.in_attic: # If this file also exists in the attic, it's a fatal error attic_path = os.path.join( os.path.dirname(pathname), 'Attic', os.path.basename(pathname)) if os.path.exists(attic_path): err = "%s: A CVS repository cannot contain both %s and %s" \ % (error_prefix, pathname, attic_path) sys.stderr.write(err + '\n') self.fatal_errors.append(err) try: cvs2svn_rcsparse.parse(open(pathname, 'rb'), fdc) except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError): err = "%s: '%s' is not a valid ,v file" \ % (error_prefix, pathname) sys.stderr.write(err + '\n') self.fatal_errors.append(err) except: Log().warn("Exception occurred while parsing %s" % pathname) raise self.num_files += 1 def _visit_directory(self, dirname, files): for fname in files: verify_filename_legal(fname) if not fname.endswith(',v'): continue self.found_valid_file = True pathname = os.path.join(dirname, fname) Log().normal(pathname) self._process_file(pathname) class CollectData: """Repository for data collected by parsing the CVS repository files. This class manages the databases into which information collected from the CVS repository is stored. The data are stored into this class by _FileDataCollector instances, one of which is created for each file to be parsed.""" def __init__(self, stats_keeper): self._cvs_item_store = NewCVSItemStore( artifact_manager.get_temp_file(config.CVS_ITEMS_STORE)) self.resync = open( artifact_manager.get_temp_file(config.RESYNC_DATAFILE), 'w') self.metadata_db = MetadataDatabase(DB_OPEN_NEW) self.fatal_errors = [] self.num_files = 0 self.symbol_stats = SymbolStatisticsCollector() self.stats_keeper = stats_keeper # Key generator to generate unique keys for each CVSRevision object: self.key_generator = KeyGenerator() self.symbol_key_generator = KeyGenerator(1) def process_project(self, project): pdc = _ProjectDataCollector(self, project) self.num_files += pdc.num_files self.fatal_errors.extend(pdc.fatal_errors) Log().verbose('Processed', self.num_files, 'files') def add_cvs_file(self, cvs_file): """Store CVS_FILE to _cvs_file_db under its persistent id.""" Ctx()._cvs_file_db.log_file(cvs_file) def add_cvs_item(self, cvs_item): self._cvs_item_store.add(cvs_item) if isinstance(cvs_item, CVSRevision): self.stats_keeper.record_cvs_rev(cvs_item) def flush(self): self._cvs_item_store.close() self.symbol_stats.write()