# Copyright © 2020 Interplanetary Database Association e.V.,
# Planetmint and IPDB software contributors.
# SPDX-License-Identifier: (Apache-2.0 AND CC-BY-4.0)
# Code is Apache-2.0 and docs are CC-BY-4.0

"""A little Python script to do some analysis of the NGINX logs.

To get the relevant NGINX logs:

1. Go to the OMS Portal
2. Create a new Log Search
3. Use a search string such as:

   Type=ContainerLog Image="planetmint/nginx_3scale:1.3" GET NOT("Go-http-client") NOT(runscope)

   (This gets all logs from the NGINX container, keeping only those
   containing the word "GET", excluding those containing the string
   "Go-http-client" [internal Kubernetes traffic], and excluding those
   containing the string "runscope" [Runscope tests].)

4. In the left sidebar, at the top, use the dropdown menu to select the
   time range, e.g. "Data based on last 7 days". Pay attention to the
   number of results and the time series chart in the left sidebar.
   Are there any spikes?
5. Export the search results. A CSV file will be saved on your local machine.
6. $ python3 analyze.py logs.csv

Thanks to https://gist.github.com/hreeder/f1ffe1408d296ce0591d
"""

import sys
import csv
import re

from dateutil.parser import parse

lineformat = re.compile(
    r"(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - "
    r"\[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} "
    r'(\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) '
    r"(?P<statuscode>\d{3}) "
    r"(?P<bytessent>\d+) "
    r'(["](?P<referrer>(\-)|(.+))["]) '
    r'(["](?P<useragent>.+)["])',
    re.IGNORECASE,
)

filepath = sys.argv[1]

logline_list = []
with open(filepath) as csvfile:
    csvreader = csv.reader(csvfile, delimiter=",")
    for row in csvreader:
        # Skip the header row; its 9th column is the literal string 'LogEntry'.
        # Data rows carry the actual NGINX log line in that same column.
        if row and (row[8] != "LogEntry"):
            logline = row[8]
            print(logline + "\n")
            logline_data = re.search(lineformat, logline)
            if logline_data:
                logline_dict = logline_data.groupdict()
                logline_list.append(logline_dict)
                # so logline_list is a list of dicts
                # print('{}'.format(logline_dict))

# Analysis
total_bytes_sent = 0
tstamp_list = []

for lldict in logline_list:
    total_bytes_sent += int(lldict["bytessent"])
    dt = lldict["dateandtime"]
    # Replace the ':' between the date and the time with a space
    # so dateutil can parse it. See https://tinyurl.com/lqjnhot
    dtime = parse(dt[:11] + " " + dt[12:])
    tstamp_list.append(dtime.timestamp())

print("Number of log lines seen: {}".format(len(logline_list)))

# Time range
trange_sec = max(tstamp_list) - min(tstamp_list)
trange_days = trange_sec / 60.0 / 60.0 / 24.0
print("Time range seen (days): {}".format(trange_days))

print("Total bytes sent: {}".format(total_bytes_sent))

print("Average bytes sent per day (out via GET): {}".format(total_bytes_sent / trange_days))
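
# A quick sanity check for the regex above. The sample log line below is
# illustrative (a hypothetical NGINX access-log entry, not taken from real
# logs), but it has the shape `lineformat` expects, so you can try it in a
# REPL after importing this module:
#
#     >>> sample = ('93.184.216.34 - - [12/Mar/2020:13:55:36 +0000] '
#     ...           '"GET /api/v1/ HTTP/1.1" 200 1234 "-" "curl/7.58.0"')
#     >>> re.search(lineformat, sample).groupdict()["statuscode"]
#     '200'
#     >>> re.search(lineformat, sample).groupdict()["bytessent"]
#     '1234'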