# Copyright (C) 2009, Google Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following disclaimer # in the documentation and/or other materials provided with the # distribution. # * Neither the name of Google Inc. nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # WebKit's Python module for parsing and modifying ChangeLog files import codecs import fileinput # inplace file editing for set_reviewer_in_changelog import logging import re import textwrap from webkitpy.common.config.committers import CommitterList from webkitpy.common.config.committers import Account import webkitpy.common.config.urls as config_urls _log = logging.getLogger(__name__) # FIXME: parse_bug_id_from_changelog should not be a free function. # Parse the bug ID out of a Changelog message based on the format that is # used by prepare-ChangeLog def parse_bug_id_from_changelog(message): if not message: return None match = re.search("^\s*" + config_urls.bug_url_short + "$", message, re.MULTILINE) if match: return int(match.group('bug_id')) match = re.search("^\s*" + config_urls.bug_url_long + "$", message, re.MULTILINE) if match: return int(match.group('bug_id')) # We weren't able to find a bug URL in the format used by prepare-ChangeLog. Fall back to the # first bug URL found anywhere in the message. return config_urls.parse_bug_id(message) class ChangeLogEntry(object): # e.g. 2009-06-03 Eric Seidel date_line_regexp = r'^(?P\d{4}-\d{2}-\d{2})\s+(?P(?P[^<]+?)\s+<(?P[^<>]+)>.*?)$' # e.g. * Source/WebCore/page/EventHandler.cpp: Implement FooBarQuux. touched_files_regexp = r'^\s*\*\s*(?P[A-Za-z0-9_\-\./\\]+)\s*\:' # e.g. Reviewed by Darin Adler. # (Discard everything after the first period to match more invalid lines.) reviewed_by_regexp = r'^\s*((\w+\s+)+and\s+)?(Review|Rubber(\s*|-)stamp)(s|ed)?\s+([a-z]+\s+)*?by\s+(?P.*?)[\.,]?\s*$' reviewed_byless_regexp = r'^\s*((Review|Rubber(\s*|-)stamp)(s|ed)?|RS)(\s+|\s*=\s*)(?P([A-Z]\w+\s*)+)[\.,]?\s*$' reviewer_name_noise_regexp = re.compile(r""" (\s+((tweaked\s+)?and\s+)?(landed|committed|okayed)\s+by.+) # "landed by", "commented by", etc... |(^(Reviewed\s+)?by\s+) # extra "Reviewed by" or "by" |([(<]\s*[\w_\-\.]+@[\w_\-\.]+[>)]) # email addresses |([(<](https?://?bugs.)webkit.org[^>)]+[>)]) # bug url |("[^"]+") # wresler names like 'Sean/Shawn/Shaun' in 'Geoffrey "Sean/Shawn/Shaun" Garen' |('[^']+') # wresler names like "The Belly" in "Sam 'The Belly' Weinig" |((Mr|Ms|Dr|Mrs|Prof)\.(\s+|$)) """, re.IGNORECASE | re.VERBOSE) reviewer_name_casesensitive_noise_regexp = re.compile(r""" ((\s+|^)(and\s+)?([a-z-]+\s+){5,}by\s+) # e.g. "and given a good once-over by" |(\(\s*(?!(and|[A-Z])).+\)) # any parenthesis that doesn't start with "and" or a capital letter |(with(\s+[a-z-]+)+) # phrases with "with no hesitation" in "Sam Weinig with no hesitation" """, re.VERBOSE) reviewer_name_noise_needing_a_backreference_regexp = re.compile(r""" (\S\S)\.(?:(\s.+|$)) # Text after the two word characters (don't match initials) and a period followed by a space. """, re.IGNORECASE | re.VERBOSE) nobody_regexp = re.compile(r"""(\s+|^)nobody( ((,|\s+-)?\s+(\w+\s+)+fix.*) # e.g. nobody, build fix... |(\s*\([^)]+\).*) # NOBODY (..)... |$)""", re.IGNORECASE | re.VERBOSE) # e.g. == Rolled over to ChangeLog-2011-02-16 == rolled_over_regexp = r'^== Rolled over to ChangeLog-\d{4}-\d{2}-\d{2} ==$' # e.g. git-svn-id: http://svn.webkit.org/repository/webkit/trunk@96161 268f45cc-cd09-0410-ab3c-d52691b4dbfc svn_id_regexp = r'git-svn-id: http://svn.webkit.org/repository/webkit/trunk@(?P\d+) ' def __init__(self, contents, committer_list=CommitterList(), revision=None): self._contents = contents self._committer_list = committer_list self._revision = revision self._parse_entry() @staticmethod def _parse_reviewer_text(text): match = re.search(ChangeLogEntry.reviewed_by_regexp, text, re.MULTILINE | re.IGNORECASE) if not match: # There are cases where people omit "by". We match it only if reviewer part looked nice # in order to avoid matching random lines that start with Reviewed match = re.search(ChangeLogEntry.reviewed_byless_regexp, text, re.MULTILINE | re.IGNORECASE) if not match: return None, None reviewer_text = match.group("reviewer") reviewer_text = ChangeLogEntry.nobody_regexp.sub('', reviewer_text) reviewer_text = ChangeLogEntry.reviewer_name_noise_regexp.sub('', reviewer_text) reviewer_text = ChangeLogEntry.reviewer_name_casesensitive_noise_regexp.sub('', reviewer_text) reviewer_text = ChangeLogEntry.reviewer_name_noise_needing_a_backreference_regexp.sub(r'\1', reviewer_text) reviewer_text = reviewer_text.replace('(', '').replace(')', '') reviewer_text = re.sub(r'\s\s+|[,.]\s*$', ' ', reviewer_text).strip() if not len(reviewer_text): return None, None reviewer_list = ChangeLogEntry._split_contributor_names(reviewer_text) # Get rid of "reviewers" like "even though this is just a..." in "Reviewed by Sam Weinig, even though this is just a..." # and "who wrote the original code" in "Noam Rosenthal, who wrote the original code" reviewer_list = [reviewer for reviewer in reviewer_list if not re.match('^who\s|^([a-z]+(\s+|\.|$)){6,}$', reviewer)] return reviewer_text, reviewer_list @staticmethod def _split_contributor_names(text): return re.split(r'\s*(?:,(?:\s+and\s+|&)?|(?:^|\s+)and\s+|&&|[/+&])\s*', text) def _fuzz_match_reviewers(self, reviewers_text_list): if not reviewers_text_list: return [] list_of_reviewers = [self._committer_list.contributors_by_fuzzy_match(reviewer)[0] for reviewer in reviewers_text_list] # Flatten lists and get rid of any reviewers with more than one candidate. return [reviewers[0] for reviewers in list_of_reviewers if len(reviewers) == 1] @staticmethod def _parse_author_name_and_email(author_name_and_email): match = re.match(r'(?P.+?)\s+<(?P[^>]+)>', author_name_and_email) return {'name': match.group("name"), 'email': match.group("email")} @staticmethod def _parse_author_text(text): if not text: return [] authors = ChangeLogEntry._split_contributor_names(text) assert(authors and len(authors) >= 1) return [ChangeLogEntry._parse_author_name_and_email(author) for author in authors] def _parse_entry(self): match = re.match(self.date_line_regexp, self._contents, re.MULTILINE) if not match: _log.warning("Creating invalid ChangeLogEntry:\n%s" % self._contents) # FIXME: group("name") does not seem to be Unicode? Probably due to self._contents not being unicode. self._author_text = match.group("authors") if match else None self._authors = ChangeLogEntry._parse_author_text(self._author_text) self._reviewer_text, self._reviewers_text_list = ChangeLogEntry._parse_reviewer_text(self._contents) self._reviewers = self._fuzz_match_reviewers(self._reviewers_text_list) self._author = self._committer_list.contributor_by_email(self.author_email()) or self._committer_list.contributor_by_name(self.author_name()) self._touched_files = re.findall(self.touched_files_regexp, self._contents, re.MULTILINE) def author_text(self): return self._author_text def revision(self): return self._revision def author_name(self): return self._authors[0]['name'] def author_email(self): return self._authors[0]['email'] def author(self): return self._author # Might be None def authors(self): return self._authors # FIXME: Eventually we would like to map reviwer names to reviewer objects. # See https://bugs.webkit.org/show_bug.cgi?id=26533 def reviewer_text(self): return self._reviewer_text # Might be None, might also not be a Reviewer! def reviewer(self): return self._reviewers[0] if len(self._reviewers) > 0 else None def reviewers(self): return self._reviewers def has_valid_reviewer(self): if self._reviewers_text_list: for reviewer in self._reviewers_text_list: reviewer = self._committer_list.committer_by_name(reviewer) if reviewer: return True return bool(re.search("unreviewed", self._contents, re.IGNORECASE)) def contents(self): return self._contents def bug_id(self): return parse_bug_id_from_changelog(self._contents) def touched_files(self): return self._touched_files # FIXME: Various methods on ChangeLog should move into ChangeLogEntry instead. class ChangeLog(object): def __init__(self, path): self.path = path _changelog_indent = " " * 8 @staticmethod def parse_latest_entry_from_file(changelog_file): """changelog_file must be a file-like object which returns unicode strings. Use codecs.open or StringIO(unicode()) to pass file objects to this class.""" date_line_regexp = re.compile(ChangeLogEntry.date_line_regexp) rolled_over_regexp = re.compile(ChangeLogEntry.rolled_over_regexp) entry_lines = [] # The first line should be a date line. first_line = changelog_file.readline() assert(isinstance(first_line, unicode)) if not date_line_regexp.match(first_line): return None entry_lines.append(first_line) for line in changelog_file: # If we've hit the next entry, return. if date_line_regexp.match(line) or rolled_over_regexp.match(line): # Remove the extra newline at the end return ChangeLogEntry(''.join(entry_lines[:-1])) entry_lines.append(line) return None # We never found a date line! svn_blame_regexp = re.compile(r'^(\s*(?P\d+) [^ ]+)\s*(?P.*?\n)') @staticmethod def _separate_revision_and_line(line): match = ChangeLog.svn_blame_regexp.match(line) if not match: return None, line return int(match.group('revision')), match.group('line') @staticmethod def parse_entries_from_file(changelog_file): """changelog_file must be a file-like object which returns unicode strings. Use codecs.open or StringIO(unicode()) to pass file objects to this class.""" date_line_regexp = re.compile(ChangeLogEntry.date_line_regexp) rolled_over_regexp = re.compile(ChangeLogEntry.rolled_over_regexp) # The first line should be a date line. revision, first_line = ChangeLog._separate_revision_and_line(changelog_file.readline()) assert(isinstance(first_line, unicode)) if not date_line_regexp.match(ChangeLog.svn_blame_regexp.sub('', first_line)): raise StopIteration entry_lines = [first_line] revisions_in_entry = {revision: 1} if revision != None else None for line in changelog_file: if revisions_in_entry: revision, line = ChangeLog._separate_revision_and_line(line) if rolled_over_regexp.match(line): break if date_line_regexp.match(line): most_probable_revision = max(revisions_in_entry, key=revisions_in_entry.__getitem__) if revisions_in_entry else None # Remove the extra newline at the end yield ChangeLogEntry(''.join(entry_lines[:-1]), revision=most_probable_revision) entry_lines = [] revisions_in_entry = {revision: 0} entry_lines.append(line) if revisions_in_entry: revisions_in_entry[revision] = revisions_in_entry.get(revision, 0) + 1 most_probable_revision = max(revisions_in_entry, key=revisions_in_entry.__getitem__) if revisions_in_entry else None yield ChangeLogEntry(''.join(entry_lines[:-1]), revision=most_probable_revision) def latest_entry(self): # ChangeLog files are always UTF-8, we read them in as such to support Reviewers with unicode in their names. changelog_file = codecs.open(self.path, "r", "utf-8") try: return self.parse_latest_entry_from_file(changelog_file) finally: changelog_file.close() # _wrap_line and _wrap_lines exist to work around # http://bugs.python.org/issue1859 def _wrap_line(self, line): return textwrap.fill(line, width=70, initial_indent=self._changelog_indent, # Don't break urls which may be longer than width. break_long_words=False, subsequent_indent=self._changelog_indent) # Workaround as suggested by guido in # http://bugs.python.org/issue1859#msg60040 def _wrap_lines(self, message): lines = [self._wrap_line(line) for line in message.splitlines()] return "\n".join(lines) def update_with_unreviewed_message(self, message): first_boilerplate_line_regexp = re.compile( "%sNeed a short description \(OOPS!\)\." % self._changelog_indent) removing_boilerplate = False # inplace=1 creates a backup file and re-directs stdout to the file for line in fileinput.FileInput(self.path, inplace=1): if first_boilerplate_line_regexp.search(line): message_lines = self._wrap_lines(message) print first_boilerplate_line_regexp.sub(message_lines, line), # Remove all the ChangeLog boilerplate before the first changed # file. removing_boilerplate = True elif removing_boilerplate: if line.find('*') >= 0: # each changed file is preceded by a * removing_boilerplate = False if not removing_boilerplate: print line, def set_reviewer(self, reviewer): latest_entry = self.latest_entry() latest_entry_contents = latest_entry.contents() reviewer_text = latest_entry.reviewer() found_nobody = re.search("NOBODY\s*\(OOPS!\)", latest_entry_contents, re.MULTILINE) if not found_nobody and not reviewer_text: bug_url_number_of_items = len(re.findall(config_urls.bug_url_long, latest_entry_contents, re.MULTILINE)) bug_url_number_of_items += len(re.findall(config_urls.bug_url_short, latest_entry_contents, re.MULTILINE)) for line in fileinput.FileInput(self.path, inplace=1): found_bug_url = re.search(config_urls.bug_url_long, line) if not found_bug_url: found_bug_url = re.search(config_urls.bug_url_short, line) print line, if found_bug_url: if bug_url_number_of_items == 1: print "\n Reviewed by %s." % (reviewer.encode("utf-8")) bug_url_number_of_items -= 1 else: # inplace=1 creates a backup file and re-directs stdout to the file for line in fileinput.FileInput(self.path, inplace=1): # Trailing comma suppresses printing newline print line.replace("NOBODY (OOPS!)", reviewer.encode("utf-8")), def set_short_description_and_bug_url(self, short_description, bug_url): message = "%s\n%s%s" % (short_description, self._changelog_indent, bug_url) bug_boilerplate = "%sNeed the bug URL (OOPS!).\n" % self._changelog_indent for line in fileinput.FileInput(self.path, inplace=1): line = line.replace("Need a short description (OOPS!).", message.encode("utf-8")) if line != bug_boilerplate: print line,