diff --git a/dev/release/README.md b/dev/release/README.md index 1ba44fb50..e85b288f4 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -56,28 +56,37 @@ Before creating a new release: - a PR should be created and merged to update the major version number of the project - A new release branch should be created, such as `branch-0.8` -### Update CHANGELOG.md +### Change Log -Define release branch (e.g. `branch-0.8`), base version tag (e.g. `0.7.0`) and future version tag (e.g. `0.9.0`). Commits -between the base version tag and the release branch will be used to populate the changelog content. +We maintain a `CHANGELOG.md` so our users know what has been changed between releases. + +The changelog is generated using a Python script: ```bash -# create the changelog -CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log-datafusion-python.sh main 0.8.0 0.7.0 -# review change log / edit issues and labels if needed, rerun until you are happy with the result -git commit -a -m 'Create changelog for release' +$ GITHUB_TOKEN= ./dev/release/generate-changelog.py apache/arrow-datafusion-python 24.0.0 HEAD > dev/changelog/25.0.0.md +``` + +This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for +titles starting with `feat:`, `fix:`, or `docs:`. 
The script will produce output similar to: + ``` +Fetching list of commits between 24.0.0 and HEAD +Fetching pull requests +Categorizing pull requests +Generating changelog content +``` + +This process is not fully automated, so there are some additional manual steps: -_If you see the error `"You have exceeded a secondary rate limit"` when running this script, try reducing the CPU -allocation to slow the process down and throttle the number of GitHub requests made per minute, by modifying the -value of the `--cpus` argument in the `update_change_log.sh` script._ +- Add the ASF header to the generated file +- Add a link to this changelog from the top-level `/datafusion/CHANGELOG.md` +- Add the following content (copy from the previous version's changelog and update as appropriate): -You can add `invalid` or `development-process` label to exclude items from -release notes. +``` +## [24.0.0](https://github.com/apache/arrow-datafusion-python/tree/24.0.0) (2023-05-06) -Send a PR to get these changes merged into the release branch (e.g. `branch-0.8`). If new commits that could change the -change log content landed in the release branch before you could merge the PR, you need to rerun the changelog update -script to regenerate the changelog and update the PR accordingly. +[Full Changelog](https://github.com/apache/arrow-datafusion-python/compare/23.0.0...24.0.0) +``` ### Preparing a Release Candidate diff --git a/dev/release/generate-changelog.md b/dev/release/generate-changelog.md new file mode 100644 index 000000000..caa6ae647 --- /dev/null +++ b/dev/release/generate-changelog.md @@ -0,0 +1,114 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generate Markdown changelog content from the pull requests between two tags.

The changelog is written to stdout; progress messages are written to stderr so
that stdout can be redirected straight into a changelog file.
"""

import argparse
import os
import re
import sys

try:
    from github import Github
except ImportError:
    # Keep the module importable (and ``--help`` usable) without PyGithub;
    # ``cli`` reports a clear error when the client is actually needed.
    Github = None


# Conventional Commits title prefix: ``type(scope)!:`` where the scope and the
# breaking-change marker ``!`` are both optional.
_CONVENTIONAL_COMMIT = re.compile(r'^([a-z]+)(\([a-z]+\))?(!)?:')


def _author_login(pull, commit):
    """Return the login to credit for ``pull``.

    Uses the commit author when present and otherwise falls back to the user
    recorded on the pull request.
    """
    author = commit.author
    if author is None:
        # NOTE(review): GitHub appears to report no author for commits whose
        # e-mail is not linked to an account -- fall back instead of crashing.
        author = pull.user
    return author.login if author is not None else "unknown"


def _categorize(pull):
    """Return the changelog category key for ``pull`` or ``None``.

    A pull request is categorized by its GitHub labels or by a Conventional
    Commits prefix in its title; the first matching rule wins, in the order
    breaking, bugs, enhancements, docs.
    """
    cc_type = ''
    cc_breaking = False
    match = _CONVENTIONAL_COMMIT.match(pull.title)
    if match:
        cc_type = match.group(1)  # fix, feat, docs, chore
        cc_breaking = match.group(3) == '!'

    labels = {label.name for label in pull.labels}
    if 'api change' in labels or cc_breaking:
        return "breaking"
    if 'bug' in labels or cc_type == 'fix':
        return "bugs"
    if 'enhancement' in labels or cc_type == 'feat':
        return "enhancements"
    if 'documentation' in labels or cc_type == 'docs':
        return "docs"
    return None


def print_pulls(repo_name, title, pulls):
    """Print one changelog section to stdout.

    Args:
        repo_name: Repository in ``owner/name`` form, used to build PR links.
        title: Section heading.
        pulls: Sequence of ``(pull, commit)`` pairs; nothing is printed when
            it is empty.
    """
    if not pulls:
        return
    print("**{}:**".format(title))
    print()
    for pull, commit in pulls:
        url = "https://github.com/{}/pull/{}".format(repo_name, pull.number)
        print("- {} [#{}]({}) ({})".format(
            pull.title, pull.number, url, _author_login(pull, commit)))
    print()


def generate_changelog(repo, repo_name, tag1, tag2):
    """Print changelog content for the commits between ``tag1`` and ``tag2``.

    Args:
        repo: Repository object providing ``compare(tag1, tag2)``.
        repo_name: Repository in ``owner/name`` form, used to build PR links.
        tag1: The previous release tag.
        tag2: The current release tag (or any other ref, e.g. ``HEAD``).
    """
    # get a list of commits between two tags
    print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr)
    comparison = repo.compare(tag1, tag2)

    # get the pull requests for these commits
    print("Fetching pull requests", file=sys.stderr)
    seen_numbers = set()
    all_pulls = []
    for commit in comparison.commits:
        for pull in commit.get_pulls():
            # there can be multiple commits per PR if squash merge is not being
            # used and in this case we should get all the author names, but for
            # now just pick the first commit seen
            if pull.number not in seen_numbers:
                seen_numbers.add(pull.number)
                all_pulls.append((pull, commit))

    # we split the pulls into categories
    # TODO: make categories configurable
    categories = {"breaking": [], "bugs": [], "docs": [], "enhancements": []}

    # categorize the pull requests based on GitHub labels and PR titles
    print("Categorizing pull requests", file=sys.stderr)
    for entry in all_pulls:
        category = _categorize(entry[0])
        if category is not None:
            categories[category].append(entry)

    # produce the changelog content
    print("Generating changelog content", file=sys.stderr)
    print_pulls(repo_name, "Breaking changes", categories["breaking"])
    print_pulls(repo_name, "Implemented enhancements", categories["enhancements"])
    print_pulls(repo_name, "Fixed bugs", categories["bugs"])
    print_pulls(repo_name, "Documentation updates", categories["docs"])
    print_pulls(repo_name, "Merged pull requests", all_pulls)


def cli(args=None):
    """Process command line arguments and print the changelog.

    Args:
        args: Argument list to parse; ``sys.argv[1:]`` is used when it is
            ``None`` or empty.
    """
    if not args:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser()
    parser.add_argument("project", help="The project name e.g. apache/arrow-datafusion-python")
    parser.add_argument("tag1", help="The previous release tag")
    parser.add_argument("tag2", help="The current release tag")
    # Parse the list we were given rather than always re-reading sys.argv.
    args = parser.parse_args(args)

    if Github is None:
        parser.error("the PyGithub package is required (pip install PyGithub)")

    # An empty GITHUB_TOKEN is treated the same as an unset one.
    token = os.getenv("GITHUB_TOKEN") or None

    g = Github(token)
    repo = g.get_repo(args.project)
    generate_changelog(repo, args.project, args.tag1, args.tag2)


if __name__ == "__main__":
    cli()