# Copyright (C) 2009, Google Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# WebKit's Python module for parsing and modifying ChangeLog files

import logging
import re
from StringIO import StringIO
import textwrap

from webkitpy.common.config.committers import CommitterList
from webkitpy.common.system.filesystem import FileSystem
import webkitpy.common.config.urls as config_urls

_log = logging.getLogger(__name__)


# FIXME: parse_bug_id_from_changelog should not be a free function.
# Parse the bug ID out of a Changelog message based on the format that is
# used by prepare-ChangeLog
def parse_bug_id_from_changelog(message):
    """Return the bug ID referenced by a ChangeLog message as an int.

    A bug URL that sits alone on a line (short form first, then long form)
    is preferred.  If none is found, the result of
    config_urls.parse_bug_id(message) is returned instead.  Returns None
    for an empty or None message.
    """
    if not message:
        return None
    match = re.search(r"^\s*" + config_urls.bug_url_short + "$", message, re.MULTILINE)
    if match:
        return int(match.group('bug_id'))
    match = re.search(r"^\s*" + config_urls.bug_url_long + "$", message, re.MULTILINE)
    if match:
        return int(match.group('bug_id'))
    # We weren't able to find a bug URL in the format used by prepare-ChangeLog. Fall back to the
    # first bug URL found anywhere in the message.
    return config_urls.parse_bug_id(message)


class ChangeLogEntry(object):
    """A single entry of a ChangeLog file, parsed eagerly on construction."""

    # e.g. 2009-06-03  Eric Seidel  <eric@webkit.org>
    date_line_regexp = r'^(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<authors>(?P<name>[^<]+?)\s+<(?P<email>[^<>]+)>.*?)$'

    # e.g. * Source/WebCore/page/EventHandler.cpp: Implement FooBarQuux.
    touched_files_regexp = r'^\s*\*\s*(?P<file>[A-Za-z0-9_\-\./\\]+)\s*\:'
    # e.g. (ChangeLogEntry.touched_functions): Added.
    touched_functions_regexp = r'^\s*\((?P<function>[^)]*)\):'

    # e.g. Reviewed by Darin Adler.
    # (Discard everything after the first period to match more invalid lines.)
    reviewed_by_regexp = r'^\s*((\w+\s+)+and\s+)?(Review|Rubber(\s*|-)stamp)(s|ed)?\s+([a-z]+\s+)*?by\s+(?P<reviewer>.*?)[\.,]?\s*$'

    reviewed_byless_regexp = r'^\s*((Review|Rubber(\s*|-)stamp)(s|ed)?|RS)(\s+|\s*=\s*)(?P<reviewer>([A-Z]\w+\s*)+)[\.,]?\s*$'

    reviewer_name_noise_regexp = re.compile(r"""
    (\s+((tweaked\s+)?and\s+)?(landed|committed|okayed)\s+by.+) # "landed by", "commented by", etc...
    |(^(Reviewed\s+)?by\s+) # extra "Reviewed by" or "by"
    |([(<]\s*[\w_\-\.]+@[\w_\-\.]+[>)]) # email addresses
    |([(<](https?://?bugs.)webkit.org[^>)]+[>)]) # bug url
    |("[^"]+") # wresler names like 'Sean/Shawn/Shaun' in 'Geoffrey "Sean/Shawn/Shaun" Garen'
    |('[^']+') # wresler names like "The Belly" in "Sam 'The Belly' Weinig"
    |((Mr|Ms|Dr|Mrs|Prof)\.(\s+|$))
    """, re.IGNORECASE | re.VERBOSE)

    reviewer_name_casesensitive_noise_regexp = re.compile(r"""
    ((\s+|^)(and\s+)?([a-z-]+\s+){5,}by\s+) # e.g. "and given a good once-over by"
    |(\(\s*(?!(and|[A-Z])).+\)) # any parenthesis that doesn't start with "and" or a capital letter
    |(with(\s+[a-z-]+)+) # phrases with "with no hesitation" in "Sam Weinig with no hesitation"
    """, re.VERBOSE)

    reviewer_name_noise_needing_a_backreference_regexp = re.compile(r"""
    (\S\S)\.(?:(\s.+|$)) # Text after the two word characters (don't match initials) and a period followed by a space.
    """, re.IGNORECASE | re.VERBOSE)

    nobody_regexp = re.compile(r"""(\s+|^)nobody(
    ((,|\s+-)?\s+(\w+\s+)+fix.*) # e.g. nobody, build fix...
    |(\s*\([^)]+\).*) # NOBODY (..)...
    |$)""", re.IGNORECASE | re.VERBOSE)

    # e.g. == Rolled over to ChangeLog-2011-02-16 ==
    rolled_over_regexp = r'^== Rolled over to ChangeLog-\d{4}-\d{2}-\d{2} ==$'

    # e.g. git-svn-id: http://svn.webkit.org/repository/webkit/trunk@96161 268f45cc-cd09-0410-ab3c-d52691b4dbfc
    # NOTE(review): nothing in this file reads this group by name; "svnid" is
    # assumed -- confirm against external users of svn_id_regexp.
    svn_id_regexp = r'git-svn-id: http://svn.webkit.org/repository/webkit/trunk@(?P<svnid>\d+) '

    split_names_regexp = r'\s*(?:,(?:\s+and\s+|&)?|(?:^|\s+)and\s+|&&|[/+&])\s*'

    # NOTE: the CommitterList() default is evaluated once, when the class is
    # defined, and is therefore shared by every entry created without an
    # explicit committer_list.
    def __init__(self, contents, committer_list=CommitterList(), revision=None):
        """Parse `contents` (the text of one ChangeLog entry).

        committer_list is used to resolve author and reviewer names;
        revision is stored as-is and returned by revision().
        """
        self._contents = contents
        self._committer_list = committer_list
        self._revision = revision
        self._parse_entry()

    @classmethod
    def _parse_reviewer_text(cls, text):
        """Return (reviewer_text, reviewer_name_list) found in `text`.

        Returns (None, None) when no "Reviewed by"-style line is found or
        when nothing is left after the noise has been stripped.
        """
        match = re.search(ChangeLogEntry.reviewed_by_regexp, text, re.MULTILINE | re.IGNORECASE)
        if not match:
            # There are cases where people omit "by". We match it only if reviewer part looked nice
            # in order to avoid matching random lines that start with Reviewed
            match = re.search(ChangeLogEntry.reviewed_byless_regexp, text, re.MULTILINE | re.IGNORECASE)
        if not match:
            return None, None

        reviewer_text = match.group("reviewer")

        # Strip the various kinds of noise, in order, before splitting names.
        reviewer_text = ChangeLogEntry.nobody_regexp.sub('', reviewer_text)
        reviewer_text = ChangeLogEntry.reviewer_name_noise_regexp.sub('', reviewer_text)
        reviewer_text = ChangeLogEntry.reviewer_name_casesensitive_noise_regexp.sub('', reviewer_text)
        reviewer_text = ChangeLogEntry.reviewer_name_noise_needing_a_backreference_regexp.sub(r'\1', reviewer_text)
        reviewer_text = reviewer_text.replace('(', '').replace(')', '')
        reviewer_text = re.sub(r'\s\s+|[,.]\s*$', ' ', reviewer_text).strip()
        if not reviewer_text:
            return None, None

        reviewer_list = ChangeLogEntry._split_reviewer_names(reviewer_text)

        # Get rid of "reviewers" like "even though this is just a..." in "Reviewed by Sam Weinig, even though this is just a..."
        # and "who wrote the original code" in "Noam Rosenthal, who wrote the original code"
        reviewer_list = [reviewer for reviewer in reviewer_list
                         if not re.match(r'^who\s|^([a-z]+(\s+|\.|$)){6,}$', reviewer)]

        return reviewer_text, reviewer_list

    @classmethod
    def _split_reviewer_names(cls, text):
        """Split a reviewer string on the separators in split_names_regexp."""
        return re.split(ChangeLogEntry.split_names_regexp, text)

    @classmethod
    def _split_author_names_with_emails(cls, text):
        """Split "Name <email>, Name <email>" text into one string per author."""
        # Splitting on '>' followed by a separator consumes the closing '>'
        # of every author but the last, so it is appended back below.
        regex = '>' + ChangeLogEntry.split_names_regexp
        names = re.split(regex, text)
        if len(names) > 1:
            names = [name + ">" for name in names[:-1]] + [names[-1]]
        return names

    def _fuzz_match_reviewers(self, reviewers_text_list):
        """Map reviewer name strings to contributors via fuzzy matching.

        Names with zero or with several candidates are dropped.
        """
        if not reviewers_text_list:
            return []
        # Element [0] of contributors_by_fuzzy_match()'s result is the list of candidates.
        list_of_reviewers = [self._committer_list.contributors_by_fuzzy_match(reviewer)[0]
                             for reviewer in reviewers_text_list]
        # Flatten lists and get rid of any reviewers with more than one candidate.
        return [reviewers[0] for reviewers in list_of_reviewers if len(reviewers) == 1]

    @classmethod
    def _parse_author_name_and_email(cls, author_name_and_email):
        """Return {'name': ..., 'email': ...} for a "Name <email>" string.

        Raises AttributeError if the string does not have that form.
        """
        match = re.match(r'(?P<name>.+?)\s+<(?P<email>[^>]+)>', author_name_and_email)
        return {'name': match.group("name"), 'email': match.group("email")}

    @classmethod
    def _parse_author_text(cls, text):
        """Return a list of {'name', 'email'} dicts; [] for empty text."""
        if not text:
            return []
        authors = cls._split_author_names_with_emails(text)
        assert authors
        return [cls._parse_author_name_and_email(author) for author in authors]

    @classmethod
    def _parse_touched_functions(cls, text):
        """Return a dict mapping each touched file to its listed function names."""
        result = {}
        cur_file = None
        for line in text.splitlines():
            file_match = re.match(cls.touched_files_regexp, line)
            if file_match:
                cur_file = file_match.group("file")
                result[cur_file] = []
            # Deliberately not an elif: a file line is also checked as a function line.
            func_match = re.match(cls.touched_functions_regexp, line)
            if func_match and cur_file:
                result[cur_file].append(func_match.group("function"))
        return result

    @classmethod
    def _parse_bug_description(cls, text):
        """Return the stripped third line of `text`, or None.

        None is returned when there are fewer than three lines or when the
        third line is a reviewer line or contains a bug URL.
        """
        # Line 3 is the bug description in most cases.
        lines = text.splitlines()
        if len(lines) < 3:
            return None
        candidate = lines[2]
        found_reviewed = re.search(ChangeLogEntry.reviewed_by_regexp, candidate, re.IGNORECASE)
        found_reviewed_byless = re.search(ChangeLogEntry.reviewed_byless_regexp, candidate, re.IGNORECASE)
        found_url = parse_bug_id_from_changelog(candidate)
        if found_reviewed or found_reviewed_byless or found_url:
            return None
        return candidate.strip()

    def _parse_entry(self):
        """Populate all parsed fields from self._contents.

        An entry whose first line is not a valid date line is still created
        (with a warning); its date, date line and author fields are None/empty.
        """
        match = re.match(self.date_line_regexp, self._contents, re.MULTILINE)
        if not match:
            _log.warning("Creating invalid ChangeLogEntry:\n%s" % self._contents)

        # Guard every use of `match`: it is None for an invalid entry.
        self._date_line = match.group() if match else None
        self._date = match.group("date") if match else None
        self._bug_description = self._parse_bug_description(self._contents)

        # FIXME: group("name") does not seem to be Unicode?  Probably due to self._contents not being unicode.
        self._author_text = match.group("authors") if match else None
        self._authors = ChangeLogEntry._parse_author_text(self._author_text)
        self._reviewer_text, self._reviewers_text_list = ChangeLogEntry._parse_reviewer_text(self._contents)
        self._reviewers = self._fuzz_match_reviewers(self._reviewers_text_list)
        if self._authors:
            self._author = (self._committer_list.contributor_by_email(self.author_email())
                            or self._committer_list.contributor_by_name(self.author_name()))
        else:
            self._author = None

        self._touched_files = re.findall(self.touched_files_regexp, self._contents, re.MULTILINE)
        self._touched_functions = self._parse_touched_functions(self._contents)

    def date_line(self):
        return self._date_line

    def date(self):
        return self._date

    def author_text(self):
        return self._author_text

    def revision(self):
        return self._revision

    def author_name(self):
        # None when the entry has no parsable author.
        return self._authors[0]['name'] if self._authors else None

    def author_email(self):
        # None when the entry has no parsable author.
        return self._authors[0]['email'] if self._authors else None

    def author(self):
        return self._author  # Might be None

    def authors(self):
        return self._authors

    # FIXME: Eventually we would like to map reviewer names to reviewer objects.
    # See https://bugs.webkit.org/show_bug.cgi?id=26533
    def reviewer_text(self):
        return self._reviewer_text

    # Might be None, might also not be a Reviewer!
    def reviewer(self):
        return self._reviewers[0] if self._reviewers else None

    def reviewers(self):
        return self._reviewers

    def has_valid_reviewer(self):
        """Return True if a listed name resolves to a reviewer or the entry says "unreviewed"."""
        if self._reviewers_text_list:
            for reviewer_name in self._reviewers_text_list:
                if self._committer_list.reviewer_by_name(reviewer_name):
                    return True
        return bool(re.search("unreviewed", self._contents, re.IGNORECASE))

    def contents(self):
        return self._contents

    def bug_id(self):
        return parse_bug_id_from_changelog(self._contents)

    def bug_description(self):
        return self._bug_description

    def touched_files(self):
        return self._touched_files

    # Returns a dict from file name to lists of function names.
    def touched_functions(self):
        return self._touched_functions

    def touched_files_text(self):
        """Return the entry text from the first touched-file line onwards ("" if none)."""
        match = re.search(self.touched_files_regexp, self._contents, re.MULTILINE)
        return self._contents[match.start():].lstrip("\n\r") if match else ""

    # Determine if any text has been added to the section on touched files
    def is_touched_files_text_clean(self):
        file_line_end = r"( (Added|Removed|(Copied|Renamed) from [A-Za-z0-9_\-./\\]+).)?$"
        for line in self.touched_files_text().splitlines():
            if re.match(self.touched_files_regexp + file_line_end, line):
                continue
            if re.match(self.touched_functions_regexp + "$", line):
                continue
            return False
        return True


# FIXME: Various methods on ChangeLog should move into ChangeLogEntry instead.
class ChangeLog(object):
    """Reads and rewrites the ChangeLog file at `path` through a filesystem object."""

    def __init__(self, path, filesystem=None):
        self.path = path
        self._filesystem = filesystem or FileSystem()

    _changelog_indent = " " * 8

    @classmethod
    def parse_latest_entry_from_file(cls, changelog_file):
        """Return the first entry of `changelog_file`, or None if there is none."""
        return next(cls.parse_entries_from_file(changelog_file), None)

    svn_blame_regexp = re.compile(r'^(\s*(?P<revision>\d+) [^ ]+)\s*(?P<line>.*?\n)')

    @classmethod
    def _separate_revision_and_line(cls, line):
        """Split an svn-blame-prefixed line into (revision, line).

        Returns (None, line) unchanged when the line has no blame prefix.
        """
        match = cls.svn_blame_regexp.match(line)
        if not match:
            return None, line
        return int(match.group('revision')), match.group('line')

    @staticmethod
    def _most_probable_revision(revisions_in_entry):
        """Return the revision with the highest line count, or None if there are no counts."""
        if not revisions_in_entry:
            return None
        return max(revisions_in_entry, key=revisions_in_entry.__getitem__)

    @classmethod
    def parse_entries_from_file(cls, changelog_file):
        """changelog_file must be a file-like object which returns
        unicode strings, e.g. from StringIO(unicode()) or
        fs.open_text_file_for_reading()

        Yields one ChangeLogEntry per entry; yields nothing when the first
        line is not a date line.  Stops at a "Rolled over to" marker.
        """
        date_line_regexp = re.compile(ChangeLogEntry.date_line_regexp)
        rolled_over_regexp = re.compile(ChangeLogEntry.rolled_over_regexp)

        # The first line should be a date line.
        revision, first_line = cls._separate_revision_and_line(changelog_file.readline())
        assert(isinstance(first_line, unicode))
        if not date_line_regexp.match(cls.svn_blame_regexp.sub('', first_line)):
            # A plain return ends the generator; raising StopIteration here
            # is an error on interpreters implementing PEP 479.
            return

        entry_lines = [first_line]
        # Maps revision -> number of lines of the current entry blamed on it.
        revisions_in_entry = {revision: 1} if revision is not None else None
        for line in changelog_file:
            # NOTE(review): after the first entry this dict is always
            # non-empty, so blame-prefix stripping is attempted on every
            # later line even for non-blame input -- confirm this is intended.
            if revisions_in_entry:
                revision, line = cls._separate_revision_and_line(line)

            if rolled_over_regexp.match(line):
                break

            if date_line_regexp.match(line):
                most_probable_revision = cls._most_probable_revision(revisions_in_entry)
                # Remove the extra newline at the end
                yield ChangeLogEntry(''.join(entry_lines[:-1]), revision=most_probable_revision)
                entry_lines = []
                revisions_in_entry = {revision: 0}

            entry_lines.append(line)
            if revisions_in_entry:
                revisions_in_entry[revision] = revisions_in_entry.get(revision, 0) + 1

        most_probable_revision = cls._most_probable_revision(revisions_in_entry)
        yield ChangeLogEntry(''.join(entry_lines[:-1]), revision=most_probable_revision)

    def latest_entry(self):
        """Return the first entry of the file at self.path (None if it has none)."""
        # ChangeLog files are always UTF-8, we read them in as such to support Reviewers with unicode in their names.
        changelog_file = self._filesystem.open_text_file_for_reading(self.path)
        try:
            return self.parse_latest_entry_from_file(changelog_file)
        finally:
            changelog_file.close()

    # _wrap_line and _wrap_lines exist to work around
    # http://bugs.python.org/issue1859

    def _wrap_line(self, line):
        return textwrap.fill(line,
                             width=70,
                             initial_indent=self._changelog_indent,
                             # Don't break urls which may be longer than width.
                             break_long_words=False,
                             subsequent_indent=self._changelog_indent)

    # Workaround as suggested by guido in
    # http://bugs.python.org/issue1859#msg60040

    def _wrap_lines(self, message):
        lines = [self._wrap_line(line) for line in message.splitlines()]
        return "\n".join(lines)

    def update_with_unreviewed_message(self, message):
        """Replace the boilerplate of the newest entry with `message`.

        The "Need a short description (OOPS!)." line is replaced by the
        wrapped message, and every following line up to the next line that
        starts with a digit 1-9 is dropped.
        """
        first_boilerplate_line_regexp = re.compile(
            r"%sNeed a short description \(OOPS!\)\." % self._changelog_indent)
        removing_boilerplate = False
        result = StringIO()
        with self._filesystem.open_text_file_for_reading(self.path) as changelog_file:
            for line in changelog_file:
                if first_boilerplate_line_regexp.search(line):
                    message_lines = self._wrap_lines(message)
                    # Use a callable so that backslashes in the message are
                    # inserted literally rather than read as regex escapes.
                    result.write(first_boilerplate_line_regexp.sub(lambda _match: message_lines, line))
                    # Remove all the ChangeLog boilerplate, except the first line (date, name, e-mail).
                    removing_boilerplate = True
                elif removing_boilerplate:
                    if re.search("^[1-9]", line):  # each changelog entry is preceded by a date
                        removing_boilerplate = False

                if not removing_boilerplate:
                    result.write(line)
        self._filesystem.write_text_file(self.path, result.getvalue())

    def set_reviewer(self, reviewer):
        """Record `reviewer` in the file.

        If the latest entry has no "NOBODY (OOPS!)" placeholder, no valid
        reviewer / "unreviewed" marker and no matched reviewer, a
        "Reviewed by" line is inserted after the last bug URL of that entry.
        Otherwise every "NOBODY (OOPS!)" in the file is replaced.
        """
        latest_entry = self.latest_entry()
        latest_entry_contents = latest_entry.contents()
        matched_reviewer = latest_entry.reviewer()
        found_nobody = re.search(r"NOBODY\s*\(OOPS!\)", latest_entry_contents, re.MULTILINE)
        found_reviewer_or_unreviewed = latest_entry.has_valid_reviewer()

        if not found_nobody and not found_reviewer_or_unreviewed and not matched_reviewer:
            # Count the bug URLs of the latest entry so the reviewer line
            # can be written right after the last one.
            bug_url_number_of_items = len(re.findall(config_urls.bug_url_long, latest_entry_contents, re.MULTILINE))
            bug_url_number_of_items += len(re.findall(config_urls.bug_url_short, latest_entry_contents, re.MULTILINE))
            result = StringIO()
            with self._filesystem.open_text_file_for_reading(self.path) as changelog_file:
                for line in changelog_file:
                    found_bug_url = re.search(config_urls.bug_url_long, line)
                    if not found_bug_url:
                        found_bug_url = re.search(config_urls.bug_url_short, line)
                    result.write(line)
                    if found_bug_url:
                        if bug_url_number_of_items == 1:
                            result.write("\n%sReviewed by %s.\n" % (self._changelog_indent, reviewer))
                        bug_url_number_of_items -= 1
            self._filesystem.write_text_file(self.path, result.getvalue())
        else:
            data = self._filesystem.read_text_file(self.path)
            newdata = data.replace("NOBODY (OOPS!)", reviewer)
            self._filesystem.write_text_file(self.path, newdata)

    def set_short_description_and_bug_url(self, short_description, bug_url):
        """Fill in the description placeholder and drop the bug-URL placeholder line."""
        message = "%s\n%s%s" % (short_description, self._changelog_indent, bug_url)
        bug_boilerplate = "%sNeed the bug URL (OOPS!).\n" % self._changelog_indent
        result = StringIO()
        with self._filesystem.open_text_file_for_reading(self.path) as changelog_file:
            for line in changelog_file:
                line = line.replace("Need a short description (OOPS!).", message)
                if line != bug_boilerplate:
                    result.write(line)
        self._filesystem.write_text_file(self.path, result.getvalue())

    def delete_entries(self, num_entries):
        """Remove the first `num_entries` entries from the file."""
        date_line_regexp = re.compile(ChangeLogEntry.date_line_regexp)
        rolled_over_regexp = re.compile(ChangeLogEntry.rolled_over_regexp)
        entries = 0
        result = StringIO()
        with self._filesystem.open_text_file_for_reading(self.path) as changelog_file:
            for line in changelog_file:
                if date_line_regexp.match(line):
                    entries += 1
                elif rolled_over_regexp.match(line):
                    # Always keep the roll-over marker and whatever follows it.
                    entries = num_entries + 1
                if entries > num_entries:
                    result.write(line)
        self._filesystem.write_text_file(self.path, result.getvalue())

    def prepend_text(self, text):
        """Write `text` in front of the current file contents."""
        data = self._filesystem.read_text_file(self.path)
        self._filesystem.write_text_file(self.path, text + data)