scripts/fetch_projects.py
# Copyright 2018 Twitter, Inc.
# SPDX-License-Identifier: Apache-2.0
import os
import json
import requests
import graphql_queries
PATH_TO_DATA = "data"
GITHUB_USERNAME = os.environ["GH_USERNAME"]
GITHUB_OAUTH_TOKEN = os.environ["OAUTH_TOKEN"]
GITHUB_API_ENDPOINT = "https://api.github.com/graphql"
print("LOG: Assuming the current path to be the root of the metrics repository.")
# Placeholder stats that get rendered into the network SVG at the end
SVG_NO_OF_MEMBERS = "N/A"
SVG_NO_OF_REPOS = "N/A"
def fetch_one_page(query_string, variables):
    """
    POST one GraphQL query to the GitHub API and return the parsed JSON response.
    """
    # requests sets the Content-Type: application/json header itself when the
    # json= keyword is used, so no explicit headers are needed here.
    r = requests.post(
        GITHUB_API_ENDPOINT,
        json={"query": query_string, "variables": variables},
        auth=(GITHUB_USERNAME, GITHUB_OAUTH_TOKEN),
    )
    if r.status_code == 200:
        return r.json()
    raise Exception("Error in GitHub API query. Status code: {}, Response: {}".format(r.status_code, r.json()))
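
# A minimal usage sketch for fetch_one_page (the "twitter" owner here is just
# an illustration, not part of this script's flow):
#
#   variables = json.dumps({"owner": "twitter", "endCursor": None})
#   page = fetch_one_page(graphql_queries.org_all_repos, variables)
#   # page["data"]["organization"]["repositories"]["edges"] -> one page of repos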
all_org_edges = []  # All the repos in the tracked orgs, with their stats

# Read repos-to-include.txt
all_orgs = []   # Orgs tracked in full, e.g. 'twitter'
all_repos = []  # Individually tracked repositories, e.g. ('pantsbuild', 'pants')
with open("repos-to-include.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:  # Skip blank lines rather than crash on the split below
            continue
        owner, repo = line.split("/")
        if repo == "*":
            all_orgs.append(owner)
        else:
            all_repos.append((owner, repo))
print("LOG: Orgs to track", all_orgs)
print("LOG: Repos to track", all_repos)
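
# repos-to-include.txt is expected to hold one "owner/repo" entry per line,
# where a "*" repo name tracks the whole org. A hypothetical example:
#
#   twitter/*
#   pantsbuild/pants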
for org in all_orgs:
    # Combine the paginated responses from the API
    has_next_page = True
    end_cursor = None
    num_of_pages = 0
    while has_next_page:
        print("LOG: Fetching page", num_of_pages, "for org", org)
        variables = json.dumps({"owner": org, "endCursor": end_cursor})
        response = fetch_one_page(graphql_queries.org_all_repos, variables)
        if org == "twitter":
            SVG_NO_OF_MEMBERS = response["data"]["organization"]["membersWithRole"]["totalCount"]
        repository_edges = response["data"]["organization"]["repositories"]["edges"]
        all_org_edges.extend(repository_edges)
        page_info = response["data"]["organization"]["repositories"]["pageInfo"]
        has_next_page = page_info["hasNextPage"]
        end_cursor = page_info["endCursor"]
        print("LOG: has_next_page", has_next_page, "end_cursor", end_cursor)
        num_of_pages += 1
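
# Each repository edge is assumed to look roughly like the sketch below; only
# the fields this script actually reads are shown, and the values are
# illustrative, not real data:
#
#   {"node": {"nameWithOwner": "twitter/some-repo",
#             "isPrivate": False,
#             "languages": {"edges": [{"size": 1024, "node": {"name": "Scala"}}]},
#             "repositoryTopics": {"edges": [{"node": {"topic": {"name": "scala"}}}]},
#             "stargazers": {"totalCount": 42},
#             "watchers": {"totalCount": 7}}}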
print("LOG: Fetched all the org repositories. Count:", len(all_org_edges))
# Fetch individual repositories' data
all_repo_edges = []  # Responses for the individually tracked repos
for repo in all_repos:
    variables = json.dumps({"owner": repo[0], "repo": repo[1], "endCursor": None})
    response = fetch_one_page(graphql_queries.repo_wise, variables)
    all_repo_edges.append(response["data"])
print("LOG: Fetched all the individual repos as well. Count:", len(all_repo_edges))
# Repos to exclude
repos_to_exclude = set()
with open("repos-to-exclude.txt", "r") as f:
    for line in f:
        repos_to_exclude.add(line.rstrip("\n"))
print("LOG: Removing private repositories")
public_repos = []
for edge in all_org_edges:
    if not edge["node"]["isPrivate"]:
        public_repos.append(edge)
for edge in all_repo_edges:
    if not edge["repository"]["isPrivate"]:
        # Normalize individual-repo responses to the org edge shape
        public_repos.append({"node": edge["repository"]})
SVG_NO_OF_REPOS = len(public_repos)
print("LOG: Number of public repos", len(public_repos))
DATA_JSON = {}
for repo in public_repos:
    repo_full_name = repo["node"]["nameWithOwner"]
    if repo_full_name in repos_to_exclude:
        print("LOG: Excluding", repo_full_name)
        continue
    DATA_JSON[repo_full_name] = repo["node"]

    # Flatten the list of languages into a space-separated string
    languages_dict = {}
    for item in DATA_JSON[repo_full_name]["languages"]["edges"]:
        languages_dict[item["node"]["name"]] = item["size"]
    total_bytes = sum(languages_dict.values())
    if total_bytes:  # Guard against repos that report no language data
        for lang in languages_dict:
            # True division (Python 3) turns each byte count into a float share
            languages_dict[lang] /= total_bytes
    # Keep only languages that make up more than 5% of the code
    languages = []
    for item, value in languages_dict.items():
        if value > 0.05:
            languages.append(item)
    DATA_JSON[repo_full_name]["languages"] = " ".join(languages)
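
    # Worked example with illustrative numbers: for {"Scala": 9700, "Shell": 300},
    # total_bytes is 10000, the shares are 0.97 and 0.03, and only Scala clears
    # the 0.05 threshold, so "languages" becomes "Scala".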
    # Flatten the list of repository topics
    _topics = DATA_JSON[repo_full_name]["repositoryTopics"]["edges"]
    topics = []
    for item in _topics:
        topics.append(item["node"]["topic"]["name"])
    DATA_JSON[repo_full_name]["repositoryTopics"] = " ".join(topics)

    # Flatten star and watcher counts
    DATA_JSON[repo_full_name]["stargazers"] = DATA_JSON[repo_full_name]["stargazers"]["totalCount"]
    DATA_JSON[repo_full_name]["watchers"] = DATA_JSON[repo_full_name]["watchers"]["totalCount"]
# Save to the data directory
file_path = os.path.join(PATH_TO_DATA, "projects.json")
with open(file_path, "w+") as f:
    json.dump(DATA_JSON, f, sort_keys=True, indent=2)
print("LOG: Saved to", file_path)
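
# Each projects.json entry is keyed by "owner/repo"; after the flattening above
# it is assumed to look roughly like this (illustrative values):
#
#   "twitter/some-repo": {"languages": "Scala", "repositoryTopics": "scala",
#                         "stargazers": 42, "watchers": 7, ...}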
# Update the SVG
print("LOG: No. of members", SVG_NO_OF_MEMBERS)
print("LOG: No. of repos", SVG_NO_OF_REPOS)
with open("static/assets/network_raw.svg") as f:
    network_svg = f.read()
network_svg = network_svg.replace("{$members}", str(SVG_NO_OF_MEMBERS))
network_svg = network_svg.replace("{$Repos}", str(SVG_NO_OF_REPOS))
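
# network_raw.svg is assumed to contain the literal placeholder tokens replaced
# above; a hypothetical excerpt:
#
#   <text>{$members} members, {$Repos} public repositories</text>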
with open("static/assets/network.svg", "w+") as f:
    f.write(network_svg)
print("LOG: static/assets/network.svg updated!")