From 7a07b71a5f8c61cda8684ac94a23003fccf37f35 Mon Sep 17 00:00:00 2001 From: Nathan Thomas Date: Sun, 19 May 2024 22:51:39 -0700 Subject: [PATCH] Add auto-PR description action --- .github/PULL_REQUEST_TEMPLATE.md | 7 + .../generate_pull_request_description.yaml | 43 +++++ LICENSE | 21 +++ ai/.gitignore | 162 ++++++++++++++++++ ai/main.py | 60 +++++++ ai/requirements.txt | 77 +++++++++ ai/setup.py | 11 ++ ai/src/constants.py | 10 ++ ai/src/utility.py | 118 +++++++++++++ 9 files changed, 509 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/generate_pull_request_description.yaml create mode 100644 LICENSE create mode 100644 ai/.gitignore create mode 100644 ai/main.py create mode 100644 ai/requirements.txt create mode 100644 ai/setup.py create mode 100644 ai/src/constants.py create mode 100644 ai/src/utility.py diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..ca4cf42 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,7 @@ +# Problem + +# Solution + +# Testing + +# Reverting diff --git a/.github/workflows/generate_pull_request_description.yaml b/.github/workflows/generate_pull_request_description.yaml new file mode 100644 index 0000000..f4815d0 --- /dev/null +++ b/.github/workflows/generate_pull_request_description.yaml @@ -0,0 +1,43 @@ +name: Generate PR Description + +on: + pull_request: + types: [opened, synchronize] + +jobs: + generate-pull-request-description: + name: Write PR Description + runs-on: ubuntu-latest + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.ref }} + + - name: Setup Python + uses: actions/setup-python@v3 + with: + python-version: "3.12" + + - name: Install Dependencies + run: | + cd ./ai + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Get PR Info + id: pr_info + run: | + pr_num=${{ 
github.event.pull_request.number }} + echo "pull_request_number=$pr_num" >> $GITHUB_ENV + + - name: Generate PR Description + env: + BRANCH_NAME: ${{ github.event.pull_request.head.ref }} + GITHUB_TOKEN: ${{ secrets.MY_GITHUB_TOKEN }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} + PR_NUMBER: ${{ env.pull_request_number }} + REPO_PATH: ${{ github.repository }} + run: python ./ai/main.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e329dde --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Nathan Thomas + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/ai/.gitignore b/ai/.gitignore new file mode 100644 index 0000000..82f9275 --- /dev/null +++ b/ai/.gitignore @@ -0,0 +1,162 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/
diff --git a/ai/main.py b/ai/main.py
new file mode 100644
index 0000000..5120c8a
--- /dev/null
+++ b/ai/main.py
@@ -0,0 +1,76 @@
+"""Generate a pull request description with an LLM and publish it."""
+
+import json
+import os
+import random
+import sys
+
+from github import Github
+
+from src.constants import *
+from src.utility import *
+
+
+def main():
+    """Write an AI-generated description to the configured pull request.
+
+    Collects the pull request's diffs and commit messages, embeds the local
+    codebase in a temporary Pinecone index to retrieve related context, asks
+    the model for a description, and updates the pull request with it.
+    """
+    # Fail fast with a readable message instead of an opaque API error when
+    # the required environment variables were not provided.
+    if not (GITHUB_API_KEY and REPO_PATH and PULL_REQUEST_NUMBER):
+        sys.exit("GITHUB_TOKEN, REPO_PATH and PR_NUMBER must all be set")
+
+    # Initialize PyGithub and fetch the pull request
+    github_client = Github(GITHUB_API_KEY)
+    repo = github_client.get_repo(REPO_PATH)
+    pull_request = repo.get_pull(PULL_REQUEST_NUMBER)
+
+    # Get pull request template content; the path is resolved against the
+    # current working directory.
+    template_path = os.path.join(os.getcwd(), PULL_REQUEST_TEMPLATE_PATH)
+    with open(template_path, "r", encoding="utf-8") as template_file:
+        pull_request_description_template = template_file.read()
+
+    # Get all current pull request diffs
+    pull_request_diffs = [
+        {"filename": changed_file.filename, "patch": changed_file.patch}
+        for changed_file in pull_request.get_files()
+    ]
+
+    # Get all current pull request commit messages
+    commit_messages = [
+        commit.commit.message for commit in pull_request.get_commits()
+    ]
+
+    # Embed the codebase and search it. The index is deleted even when one
+    # of these steps fails: it has a fixed name, so a leftover index would
+    # make the index creation of the next run fail.
+    current_index_name = "autopr"
+    try:
+        embed_documents(repo, current_index_name)
+        codebase_context = get_embeddings_for_diffs(
+            current_index_name, pull_request_diffs
+        )
+    finally:
+        delete_embeddings_for_codebase(current_index_name)
+
+    # Build prompt
+    prompt = format_data_for_prompt(
+        pull_request_diffs,
+        commit_messages,
+        codebase_context,
+        pull_request_description_template,
+    )
+
+    # Call model for PR description
+    pr_description = generate_pr_description(prompt)
+
+    # Update PR description
+    update_pr_description(repo, PULL_REQUEST_NUMBER, pr_description)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ai/requirements.txt b/ai/requirements.txt
new file mode 100644
index 0000000..e423ddb
--- /dev/null
+++ b/ai/requirements.txt
@@ -0,0 +1,77 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+backoff==2.2.1
+beautifulsoup4==4.12.3
+certifi==2024.2.2
+cffi==1.16.0
+chardet==5.2.0
+charset-normalizer==3.3.2
+click==8.1.7
+cryptography==42.0.7
+dataclasses-json==0.6.6
+deepdiff==7.0.1
+Deprecated==1.2.14
+distro==1.9.0
+emoji==2.11.1
+filetype==1.2.0
+frozenlist==1.4.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+idna==3.7
+joblib==1.4.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+jsonpointer==2.4
+langchain==0.2.0
+langchain-community==0.2.0
+langchain-core==0.2.0
+langchain-openai==0.1.7
+langchain-pinecone==0.1.1
+langchain-text-splitters==0.2.0
+langdetect==1.0.9
+langsmith==0.1.59
+lxml==5.2.2
+Markdown==3.6
+marshmallow==3.21.2
+multidict==6.0.5
+mypy-extensions==1.0.0
+nltk==3.8.1
+numpy==1.26.4
+openai==1.30.1
+ordered-set==4.1.0
+orjson==3.10.3
+packaging==23.2
+pinecone-client==3.2.2
+pycparser==2.22
+pydantic==2.7.1
+pydantic_core==2.18.2
+PyGithub==2.3.0
+PyJWT==2.8.0
+PyNaCl==1.5.0
+pypdf==4.2.0
+python-dateutil==2.9.0.post0
+python-iso639==2024.4.27
+python-magic==0.4.27
+PyYAML==6.0.1
+rapidfuzz==3.9.1
+regex==2024.5.15
+requests==2.31.0
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.30
+tabulate==0.9.0
+tenacity==8.3.0
+tiktoken==0.7.0
+tqdm==4.66.4
+typing-inspect==0.9.0
+typing_extensions==4.11.0
+unstructured==0.11.8
+unstructured-client==0.22.0
+urllib3==2.2.1
+wrapt==1.16.0
+yarl==1.9.4
diff --git a/ai/setup.py b/ai/setup.py
new file mode 100644
index 0000000..3932e45
--- /dev/null
+++ b/ai/setup.py
@@ -0,0 +1,20 @@
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+# Resolve the requirements file relative to this script so that installation
+# works regardless of the current working directory.
+requirements_path = Path(__file__).parent / "requirements.txt"
+requirements = [
+    line.strip()
+    for line in requirements_path.read_text(encoding="utf-8").splitlines()
+    # Blank lines and comments are not valid requirement specifiers.
+    if line.strip() and not line.strip().startswith("#")
+]
+
+setup(
+    name="ai",
+    version="0.0.0-alpha",
+    packages=find_packages(),
+    install_requires=requirements,
+)
diff --git a/ai/src/constants.py b/ai/src/constants.py
new file mode 100644
index 0000000..0258369
--- /dev/null
+++ b/ai/src/constants.py
@@ -0,0 +1,15 @@
+"""Configuration read from the environment by the PR-description generator."""
+
+import os
+
+BRANCH_NAME = os.getenv("BRANCH_NAME", "")
+EMBEDDING_MODEL = "text-embedding-3-large"
+# Vector size produced by EMBEDDING_MODEL; the Pinecone index must match it.
+EMBEDDING_DIMENSION = 3072
+GITHUB_API_KEY = os.getenv("GITHUB_TOKEN", "")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "")
+# `or "0"` also covers a variable that is set but empty, which int() rejects.
+PULL_REQUEST_NUMBER = int(os.getenv("PR_NUMBER") or "0")
+PULL_REQUEST_TEMPLATE_PATH = ".github/PULL_REQUEST_TEMPLATE.md"
+REPO_PATH = os.getenv("REPO_PATH", "")
diff --git a/ai/src/utility.py b/ai/src/utility.py
new file mode 100644
index 0000000..73f6aec
--- /dev/null
+++ b/ai/src/utility.py
@@ -0,0 +1,175 @@
+"""Helpers for embedding the codebase and generating a PR description."""
+
+import base64
+import json
+import os
+from typing import Any, List
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import DirectoryLoader
+from langchain_core.documents import Document
+from langchain_core.output_parsers.string import StrOutputParser
+from langchain_openai import ChatOpenAI
+from langchain_openai import OpenAIEmbeddings
+from langchain_pinecone import PineconeVectorStore
+from pinecone import Pinecone, ServerlessSpec
+
+from .constants import *
+
+
+def load_documents(repo: Any) -> List[Document]:
+    """Load the files under the working directory as documents.
+
+    `repo` is kept for interface compatibility but is not used: files are
+    read from the local checkout, not through the GitHub API.
+    """
+    # recursive=True makes the glob descend into subdirectories; without it
+    # only files directly inside the working directory are matched.
+    # silent_errors=True skips files the loader cannot parse instead of
+    # aborting the whole run.
+    loader = DirectoryLoader(
+        "./", glob="*.*", recursive=True, silent_errors=True
+    )
+
+    return loader.load()
+
+
+def get_split_documents(raw_documents: List[Document]) -> List[Document]:
+    """Split documents into overlapping chunks to be embedded."""
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000, chunk_overlap=100
+    )
+
+    return text_splitter.split_documents(raw_documents)
+
+
+def embed_documents(repo: Any, index_name: str) -> None:
+    """Create the Pinecone index `index_name` and store the codebase in it.
+
+    The index is created here, so it must not exist yet; remove it with
+    `delete_embeddings_for_codebase` once it is no longer needed.
+    """
+    split_documents = get_split_documents(load_documents(repo))
+    embeddings = OpenAIEmbeddings(
+        openai_api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL
+    )
+
+    pc = Pinecone(api_key=PINECONE_API_KEY)
+    pc.create_index(
+        name=index_name,
+        dimension=EMBEDDING_DIMENSION,
+        metric="cosine",
+        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+    )
+
+    PineconeVectorStore.from_documents(
+        documents=split_documents,
+        embedding=embeddings,
+        index_name=index_name,
+    )
+
+
+def get_embeddings_for_diffs(
+    index_name: str, diffs: List[dict]
+) -> List[Document]:
+    """Return the stored chunks most similar to the pull request diffs.
+
+    The diffs are serialized to a single JSON string used as the query.
+    """
+    # Pass the key explicitly, as embed_documents does, instead of relying
+    # on the client picking it up from the environment.
+    embeddings = OpenAIEmbeddings(
+        openai_api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL
+    )
+    document_vectorstore = PineconeVectorStore(
+        index_name=index_name,
+        embedding=embeddings,
+    )
+    retriever = document_vectorstore.as_retriever()
+
+    return retriever.invoke(json.dumps(diffs))
+
+
+def delete_embeddings_for_codebase(index_name: str) -> None:
+    """Delete the Pinecone index `index_name`."""
+    pc = Pinecone(api_key=PINECONE_API_KEY)
+    pc.delete_index(index_name)
+
+
+def _render_codebase_context(codebase_context: Any) -> str:
+    """Turn the retrieved context into plain text for the prompt."""
+    if not codebase_context:
+        return ""
+    if isinstance(codebase_context, str):
+        return codebase_context
+
+    # Retrieved documents carry their text in `page_content`; any other item
+    # is stringified as-is.
+    return "\n\n".join(
+        str(getattr(item, "page_content", item)) for item in codebase_context
+    )
+
+
+def format_data_for_prompt(
+    diffs, commit_messages, codebase_context, pull_request_description_template
+):
+    """Build the model prompt from the diffs, commits, context, and template.
+
+    `diffs` is a list of dicts with "filename" and "patch" keys. A missing
+    patch is shown as a placeholder instead of the text "None".
+    """
+    # Combine the changes into a string with clear delineation.
+    changes = "\n".join(
+        f'File: {file["filename"]}\n'
+        f'Diff: \n{file["patch"] or "(no textual diff available)"}\n'
+        for file in diffs
+    )
+
+    # Combine all commit messages
+    commit_messages = "\n".join(commit_messages) + "\n\n"
+
+    # Render the context as text and delimit it with newlines so it does not
+    # run into the surrounding instructions.
+    context = _render_codebase_context(codebase_context)
+
+    # Construct the prompt with clear instructions for the LLM.
+    prompt = (
+        "Please review the following code changes and commit messages from a GitHub pull request:\n"
+        "Code changes from Pull Request:\n"
+        f"{changes}\n"
+        "Commit messages:\n"
+        f"{commit_messages}"
+        "Here's context from existing files in the codebase:\n"
+        f"{context}\n\n"
+        "Consider the code changes, commit messages, and codebase context given above, write a pull request description. Be concise with no yapping, and just summarize bullet points of what's changed instead of listing all file paths and what changed inside each of them. Keep all of this relatively high level.\n"
+        "Here is the markdown pull request template. Only use the sections in this content and do not add any others. Do not add a new Pull Request Description header. Keep the markdown heirarchy as it is in the content you've been given, and don't add any other headers. Do not add a new title for the template.\n"
+        f"{pull_request_description_template}\n"
+        "Pull request description:\n"
+    )
+
+    return prompt
+
+
+def generate_pr_description(prompt: str, model: str = "gpt-4o") -> str:
+    """Ask an OpenAI chat model to write the pull request description.
+
+    `model` defaults to "gpt-4o", the model that used to be hard-coded.
+    """
+    client = ChatOpenAI(api_key=OPENAI_API_KEY, model=model)
+
+    messages = [
+        {"role": "system", "content": "You are an expert code reviewer tasked with summarizing all changes in a pull request and writing an incredible description for it."},
+        {"role": "user", "content": prompt},
+    ]
+
+    response = client.invoke(input=messages)
+
+    return StrOutputParser().invoke(input=response)
+
+
+def update_pr_description(
+    repo: Any, pull_request_number: int, pull_request_description: str
+) -> None:
+    """Replace the body of the given pull request with a new description."""
+    pr = repo.get_pull(pull_request_number)
+    pr.edit(body=pull_request_description)