Clark Boylan e6dedce81c Add bulk processing and csv output
Running this command for each different project and metric gets old,
particularly when you have to input the password each time. Update the
script to collect all metrics for a list of projects. Then, to make
that more useful, add support for csv output.

Change-Id: Id5ee94e046e11813387ad0d3ae4a9a2e8490062d
2025-04-01 09:34:41 -07:00


# Copyright OpenDev Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language
# governing permissions and limitations under the License.
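
# Query Bitergia's OpenSearch service (openstack.biterg.io by default) for
# Gerrit review metrics across one or more projects. Example invocation
# (the script name and project names here are hypothetical):
#
#   python3 review_metrics.py --user myuser --csv \
#       openstack/nova,openstack/neutron \
#       2025-01-01T00:00:00.000Z 2025-04-01T00:00:00.000Z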

import argparse
import copy
import datetime
import getpass
import math

# Requires the opensearch-py package: pip install opensearch-py
from opensearchpy import OpenSearch

# Base query parameters to get human authored changes
BASE_CHANGES_QUERY = {
    "query": {
        "bool": {
            # Should is logical OR
            "should": [
            ],
            # Must is logical AND
            "must": [
            ],
            # NOT(A OR B)
            "must_not": [
            ],
            "filter": [
                {
                    "term": {
                        "type": "changeset"
                    }
                },
                {
                    "term": {
                        "author_bot": False
                    }
                },
            ]
        }
    },
}
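
# Status term clauses plugged into the base query's "should" or "must_not"
# sections to select (or exclude) closed and merged changes.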
CLOSED_STATUS = [
    {
        "term": {
            "status": "MERGED"
        }
    },
    {
        "term": {
            "status": "ABANDONED"
        }
    },
]

MERGED_STATUS = [
    {
        "term": {
            "status": "MERGED"
        }
    },
]
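

# Build a range filter on grimoire_creation_date. Dates are expected in
# strict_date_optional_time form, eg 2025-01-01T00:00:00.000Z.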
def set_date_range(start_date, end_date):
    date_range = {
        "range": {
            "grimoire_creation_date": {
                "gte": start_date,
                "lte": end_date,
                "format": "strict_date_optional_time"
            }
        }
    }
    return date_range
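

# Review efficiency index: the ratio of closed (merged or abandoned) changes
# to still open changes among changes created in the window. Returns a single
# value; the average slot of the tuple is None.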
def calculate_review_efficiency(client, project_name, start_date, end_date):
    date_range = set_date_range(start_date, end_date)
    project_term = {
        "term": {
            "project": project_name
        }
    }

    closed_query = copy.deepcopy(BASE_CHANGES_QUERY)
    closed_query["query"]["bool"]["should"] = CLOSED_STATUS
    closed_query["query"]["bool"]["filter"].append(date_range)
    closed_query["query"]["bool"]["must"].append(project_term)
    closed_query["track_total_hits"] = True
    closed_query["size"] = 0

    open_query = copy.deepcopy(BASE_CHANGES_QUERY)
    open_query["query"]["bool"]["must_not"] = CLOSED_STATUS
    open_query["query"]["bool"]["filter"].append(date_range)
    open_query["query"]["bool"]["must"].append(project_term)
    open_query["track_total_hits"] = True
    open_query["size"] = 0

    r = client.search(index='openstack_gerrit', body=closed_query)
    closed_total = r['hits']['total']['value']
    r = client.search(index='openstack_gerrit', body=open_query)
    open_total = r['hits']['total']['value']
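
    # Assumes at least one change created in the window is still open; a
    # zero open_total would make this division raise ZeroDivisionError.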
    rei = float(closed_total / open_total)
    return rei, None
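

# Median and average wall clock time from creation to last update for
# changes created in the window that have since merged.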
def calculate_merge_time(client, project_name, start_date, end_date):
    date_range = set_date_range(start_date, end_date)
    project_term = {
        "term": {
            "project": project_name
        }
    }

    closed_query = copy.deepcopy(BASE_CHANGES_QUERY)
    closed_query["query"]["bool"]["should"] = MERGED_STATUS
    closed_query["query"]["bool"]["filter"].append(date_range)
    closed_query["query"]["bool"]["must"].append(project_term)
    closed_query["size"] = 100

    times_to_merge = []
    r = client.search(index='openstack_gerrit', scroll='1m', body=closed_query)
    hits = r['hits']['hits']
    while hits:
        for hit in hits:
            open_date = datetime.datetime.fromisoformat(
                hit["_source"]["grimoire_creation_date"])
            close_date = datetime.datetime.fromisoformat(
                hit["_source"]["last_updated"])
            times_to_merge.append(close_date - open_date)
        # Beware normal pagination. Results are not consistent.
        # Use scroll instead
        r = client.scroll(scroll_id=r["_scroll_id"], scroll="1m")
        hits = r['hits']['hits']

    times_sorted = sorted(times_to_merge)
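    # Approximate median: with an even number of samples this takes the
    # upper of the two middle values rather than averaging them. Assumes
    # at least one merged change was found in the window.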
    middle = math.floor(len(times_to_merge) / 2)
    median = times_sorted[middle]
    average_seconds = \
        sum(map(lambda x: x.total_seconds(), times_sorted)) / len(times_sorted)
    average = datetime.timedelta(seconds=average_seconds)
    return median, average
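

# Median and average time to first review for merged changes created in the
# window, based on the time_to_first_review field (a float number of days).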
def calculate_time_to_review(client, project_name, start_date, end_date):
    date_range = set_date_range(start_date, end_date)
    project_term = {
        "term": {
            "project": project_name
        }
    }

    closed_query = copy.deepcopy(BASE_CHANGES_QUERY)
    closed_query["query"]["bool"]["should"] = MERGED_STATUS
    closed_query["query"]["bool"]["filter"].append(date_range)
    closed_query["query"]["bool"]["must"].append(project_term)
    closed_query["size"] = 100

    times_to_review = []
    r = client.search(index='openstack_gerrit', scroll='1m', body=closed_query)
    hits = r['hits']['hits']
    while hits:
        for hit in hits:
            # Note time_to_first_review appears to be storing a float count
            # of the number of days to the first review. This is an odd
            # way to store the value, so I'm documenting it here.
            time_to_first_review = hit["_source"]["time_to_first_review"]
            if time_to_first_review:
                # We can apparently get None values back. Ignore them.
                times_to_review.append(time_to_first_review)
        # Beware normal pagination. Results are not consistent.
        # Use scroll instead
        r = client.scroll(scroll_id=r["_scroll_id"], scroll="1m")
        hits = r['hits']['hits']

    times_sorted = sorted(times_to_review)
    middle = math.floor(len(times_to_review) / 2)
    median = datetime.timedelta(days=times_sorted[middle])
    average = datetime.timedelta(days=sum(times_sorted) / len(times_sorted))
    return median, average
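

# Median and average number of patchsets per change for closed (merged or
# abandoned) changes created in the window.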
def calculate_patchset_per_review(client, project_name, start_date, end_date):
    date_range = set_date_range(start_date, end_date)
    project_term = {
        "term": {
            "project": project_name
        }
    }

    closed_query = copy.deepcopy(BASE_CHANGES_QUERY)
    closed_query["query"]["bool"]["should"] = CLOSED_STATUS
    closed_query["query"]["bool"]["filter"].append(date_range)
    closed_query["query"]["bool"]["must"].append(project_term)
    closed_query["size"] = 100

    patchsets_list = []
    r = client.search(index='openstack_gerrit', scroll='1m', body=closed_query)
    hits = r['hits']['hits']
    while hits:
        for hit in hits:
            patchsets = hit["_source"]["patchsets"]
            patchsets_list.append(patchsets)
        # Beware normal pagination. Results are not consistent.
        # Use scroll instead
        r = client.scroll(scroll_id=r["_scroll_id"], scroll="1m")
        hits = r['hits']['hits']

    patchsets_sorted = sorted(patchsets_list)
    middle = math.floor(len(patchsets_list) / 2)
    median = patchsets_sorted[middle]
    average = sum(patchsets_sorted) / len(patchsets_sorted)
    return median, average
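

# Map each metric name to its calculation function. Every function returns
# a (median, average) tuple; average is None for single valued metrics.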
QUERIES = {
    "rei": calculate_review_efficiency,
    "time-to-merge": calculate_merge_time,
    "time-to-review": calculate_time_to_review,
    "patchset-per-review": calculate_patchset_per_review,
}
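

# Run the selected metric (or all of them) against each project in the
# comma separated project list and emit either csv rows or human readable
# lines.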
def gather_metrics(client, args):
    if args.csv:
        print("metric,project,starttime,endtime,value")
    projects = [p for p in args.project.split(',') if p]
    if args.query == "ALL":
        queries = QUERIES.items()
    else:
        queries = [(args.query, QUERIES[args.query])]
    for query, func in queries:
        for project in projects:
            median, average = func(
                client, project, args.start_date, args.end_date)
            if args.csv:
                if isinstance(median, datetime.timedelta):
                    median = median.total_seconds()
                if isinstance(average, datetime.timedelta):
                    average = average.total_seconds()
                if not average:
                    # Some metrics return a single value; it is
                    # carried in the median slot
                    print("%s,%s,%s,%s,%s" %
                          (query, project,
                           args.start_date, args.end_date, median))
                else:
                    print("median-%s,%s,%s,%s,%s" %
                          (query, project,
                           args.start_date, args.end_date, median))
                    print("average-%s,%s,%s,%s,%s" %
                          (query, project,
                           args.start_date, args.end_date, average))
            else:
                if not average:
                    # Some metrics return a single value; it is
                    # carried in the median slot
                    print("%s %s %s to %s: %s" %
                          (project, query,
                           args.start_date, args.end_date, median))
                else:
                    print("%s median %s %s to %s: %s" %
                          (project, query,
                           args.start_date, args.end_date, median))
                    print("%s average %s %s to %s: %s" %
                          (project, query,
                           args.start_date, args.end_date, average))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host",
                        help="Bitergia opensearch host to connect to",
                        default="openstack.biterg.io")
    parser.add_argument("--port",
                        help="Bitergia opensearch port to connect to",
                        type=int, default=443)
    parser.add_argument("--url-prefix",
                        help="Bitergia opensearch url prefix to connect to",
                        default='data')
    parser.add_argument("--user",
                        help="Bitergia opensearch username", required=True)
    parser.add_argument("--query",
                        help="Metric to query",
                        default='ALL')
    parser.add_argument("--csv", help="Emit csv output", action="store_true")
    parser.add_argument("project",
                        help="Project to filter results for. This must "
                             "match Bitergia's idea of a project name. May "
                             "be a comma separated list.")
    parser.add_argument("start_date",
                        help="Start date for results. "
                             "eg 2025-01-01T00:00:00.000Z")
    parser.add_argument("end_date",
                        help="End date for results. "
                             "eg 2025-01-01T00:00:00.000Z")
    args = parser.parse_args()

    host = args.host
    port = args.port
    url_prefix = args.url_prefix
    passwd = getpass.getpass('Password: ')
    auth = (args.user, passwd)

    # indexes appear to be openstack_git, openstack_gerrit, openstack_mbox
    client = OpenSearch(
        hosts=[{'host': host, 'port': port}],
        url_prefix=url_prefix,
        http_compress=True,  # enables gzip compression for request bodies
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
    )

    gather_metrics(client, args)


if __name__ == "__main__":
    main()