diff --git a/cdf_compare.py b/cdf_compare.py new file mode 100755 index 0000000..158a56f --- /dev/null +++ b/cdf_compare.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +import multiprocessing +import os +import pickle +from argparse import ArgumentParser +from math import ceil +from time import sleep + +import matplotlib +import pandas as pd +import matplotlib.pyplot as plt +from mpl_toolkits import axisartist +from mpl_toolkits.axes_grid1 import host_subplot + + + +def csv_to_dataframe(csv_list, dummy): + + global n + global frame_list + + transmission_df = None + + for csv in csv_list: + tmp_df = pd.read_csv( + "{}{}".format(args.pcap_csv_folder, csv), + dtype=dict(is_retranmission=bool, is_dup_ack=bool), + ) + tmp_df["datetime"] = pd.to_datetime(tmp_df["datetime"]) - pd.Timedelta(hours=1) + tmp_df = tmp_df.set_index("datetime") + tmp_df.index = pd.to_datetime(tmp_df.index) + if transmission_df is None: + transmission_df = tmp_df + else: + transmission_df = pd.concat([transmission_df, tmp_df]) + + n.value += 1 + + frame_list.append(transmission_df) + + +from itertools import islice + +def chunk(it, size): + it = iter(it) + return iter(lambda: tuple(islice(it, size)), ()) + + +def plot_cdf(dataframe, column_name): + stats_df = dataframe \ + .groupby(column_name) \ + [column_name] \ + .agg("count") \ + .pipe(pd.DataFrame) \ + .rename(columns={column_name: "frequency"}) + + # PDF + stats_df["PDF"] = stats_df["frequency"] / sum(stats_df["frequency"]) + + # CDF + stats_df["CDF"] = stats_df["PDF"].cumsum() + stats_df = stats_df.reset_index() + + stats_df.plot(x=column_name, y=["CDF"], grid=True) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--serial1", required=True, help="Serial csv file1.") + parser.add_argument("--serial2", required=True, help="Serial csv file2.") + parser.add_argument("--folder1", required=True, help="PCAP csv folder1.") + parser.add_argument("--folder2", required=True, help="PCAP csv folder2.") + parser.add_argument("--save", default=None, help="Location to save pdf file.") + parser.add_argument( + "-c", + "--cores", + default=1, + type=int, + help="Number of cores for multiprocessing.", + ) + parser.add_argument( + "-i", + "--interval", + default=2, + type=int, + help="Time interval for rolling window.", + ) + + args = parser.parse_args() + + transmission_df_list = list() + for f in [args.folder1, args.folder2]: + manager = multiprocessing.Manager() + n = manager.Value("i", 0) + frame_list = manager.list() + jobs = [] + + # load all pcap csv into one dataframe + pcap_csv_list = list() + for filename in os.listdir(args.pcap_csv_folder): + if filename.endswith(".csv") and "tcp" in filename: + pcap_csv_list.append(filename) + + parts = chunk(pcap_csv_list, ceil(len(pcap_csv_list) / args.cores)) + print("Start processing with {} jobs.".format(args.cores)) + for p in parts: + process = multiprocessing.Process(target=csv_to_dataframe, args=(p, "dummy")) + jobs.append(process) + + for j in jobs: + j.start() + + print("Started all jobs.") + # Ensure all the processes have finished + finished_job_counter = 0 + working = ["|", "/", "-", "\\", "|", "/", "-", "\\"] + w = 0 + while len(jobs) != finished_job_counter: + sleep(1) + print( + "\r\t{}{}{}\t Running {} jobs ({} finished). Processed {} out of {} pcap csv files. ({}%) ".format( + working[w], + working[w], + working[w], + len(jobs), + finished_job_counter, + n.value, + len(pcap_csv_list), + round((n.value / len(pcap_csv_list)) * 100, 2), + ), + end="", + ) + finished_job_counter = 0 + for j in jobs: + if not j.is_alive(): + finished_job_counter += 1 + if (w + 1) % len(working) == 0: + w = 0 + else: + w += 1 + print("\r\nSorting table...") + + transmission_df = pd.concat(frame_list) + frame_list = None + transmission_df = transmission_df.sort_index() + + print("Calculate goodput...") + + transmission_df["srtt"] = transmission_df["srtt"].apply(lambda x: x / 10 ** 6) + + # key for columns and level for index + transmission_df["goodput"] = transmission_df["payload_size"].groupby(pd.Grouper(level="datetime", freq="{}s".format(args.interval))).transform("sum") + transmission_df["goodput"] = transmission_df["goodput"].apply( + lambda x: ((x * 8) / args.interval) / 10**6 + ) + + transmission_df["goodput_rolling"] = transmission_df["payload_size"].rolling("{}s".format(args.interval)).sum() + transmission_df["goodput_rolling"] = transmission_df["goodput_rolling"].apply( + lambda x: ((x * 8) / args.interval) / 10 ** 6 + ) + + # set meta values + cc_algo = transmission_df["congestion_control"].iloc[0] + cc_algo = cc_algo.upper() + transmission_direction = transmission_df["direction"].iloc[0] + + # read serial csv + serial_df = pd.read_csv(args.serial_file) + serial_df["datetime"] = pd.to_datetime(serial_df["datetime"]) - pd.Timedelta(hours=1) + serial_df = serial_df.set_index("datetime") + serial_df.index = pd.to_datetime(serial_df.index) + serial_df.sort_index() + + transmission_df = pd.merge_asof( + transmission_df, + serial_df, + tolerance=pd.Timedelta("1s"), + right_index=True, + left_index=True, + ) + + transmission_df_list.append(dict( + df=transmission_df, + cc_algo=cc_algo, + transmission_direction=transmission_direction + )) + + # Plot sRTT CDF + plot_cdf(transmission_df_list[0]["df"], "srtt") + plot_cdf(transmission_df_list[1]["df"], "srtt") + plt.xscale("log") + plt.xlabel("sRTT [s]") + plt.ylabel("CDF") + plt.legend([transmission_df_list[0]["cc_algo"], transmission_df_list[1]["cc_algo"]]) + plt.title("{}".format(transmission_direction)) + plt.savefig("{}{}_cdf_compare_plot.pdf".format(args.save, "srtt")) + + plt.clf() + + # Plot goodput CDF + plot_cdf(transmission_df_list[0]["df"], "goodput") + plot_cdf(transmission_df_list[1]["df"], "goodput") + plt.xlabel("goodput [mbps]") + plt.ylabel("CDF") + plt.legend([transmission_df_list[0]["cc_algo"], transmission_df_list[1]["cc_algo"]]) + plt.title("{}".format(transmission_direction)) + plt.savefig("{}{}_cdf_compare_plot.pdf".format(args.save, "goodput")) \ No newline at end of file