docs/results.md (3 changes: 2 additions & 1 deletion)
@@ -46,7 +46,8 @@ created in the first step ([preparation](preparation.md)).
for each assignment which has a status of "submitted".
* `[downloaded_batch_result]_votes_per_clip.csv`: Aggregated result per clip, including MOS, standard deviations, and 95% Confidence Intervals.
* `[downloaded_batch_result]_votes_per_cond.csv`: Aggregated result per condition.
* `[downloaded_batch_result]_votes_per_worker.csv`: Long format of ratings per clip; includes HITId, workerid, file, vote, and condition.
* `[downloaded_batch_result]_aggregated_results.csv`: Aggregated MOS statistics grouped by the configured condition keys (and question type), with 95% confidence intervals; see the sketch after this list.
* `[downloaded_batch_result]_quantity_bonus_report.csv`: List of workers eligible for the quantity bonus and the corresponding bonus amounts (to be used with mturk_utils.py).
* `[downloaded_batch_result]_quality_bonus_report.csv`: List of workers eligible for the quality bonus and the corresponding bonus amounts (to be used with mturk_utils.py).
* `[downloaded_batch_result]_extending.csv`: List of HITIds and the number of additional assignments each needs to reach a specific number of votes per clip.
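
For orientation, a minimal sketch of loading the aggregated report with pandas; the batch name `Batch_123` is a hypothetical placeholder, and the column names follow the `aggregate_condition_results` function added below:

```python
# Minimal sketch: inspect the per-condition report written by aggregate_condition_results.
# "Batch_123" is a hypothetical batch name; replace it with the downloaded batch result name.
import pandas as pd

agg = pd.read_csv("Batch_123_aggregated_results.csv")
# Expected columns: the configured condition keys, question_type (when present),
# n (number of votes), MOS, std, and 95%CI.
print(agg.sort_values("MOS", ascending=False).to_string(index=False))
```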
src/result_parser.py (34 changes: 34 additions & 0 deletions)
@@ -1659,6 +1659,36 @@ def number_of_uniqe_workers(answers):
    return len(df)


def aggregate_condition_results(votes, cfg, out_path):
    """Aggregate votes by condition keys and question type.

    Parameters
    ----------
    votes : list[dict]
        List of vote entries generated during parsing.
    cfg : configparser.ConfigParser
        Configuration to read ``condition_keys`` from (section ``general``).
    out_path : str
        Path of the CSV file to create.
    """
    if len(votes) == 0:
        return

    df = pd.DataFrame(votes)
    # group by the configured condition keys plus question_type, when present
    group_keys = []
    if cfg.has_option('general', 'condition_keys'):
        group_keys.extend([k.strip() for k in cfg['general']['condition_keys'].split(',')])
    # keep only keys that actually exist as columns in the vote records
    group_keys = [k for k in group_keys if k in df.columns]

    if 'question_type' in df.columns:
        group_keys.append('question_type')

    # without any grouping column, groupby would raise; skip the report instead
    if not group_keys:
        return

    agg = df.groupby(group_keys)['vote'].agg(['count', 'mean', 'std']).reset_index()
    agg.rename(columns={'count': 'n', 'mean': 'MOS'}, inplace=True)
    # 95% confidence interval of the mean, normal approximation: 1.96 * std / sqrt(n)
    agg['95%CI'] = 1.96 * agg['std'] / np.sqrt(agg['n'])
    agg.to_csv(out_path, index=False)


def get_ans_suffixes(test_method):
    if "p835" in test_method:
        question_name_suffix = p835_suffixes[2]
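
The grouping in `aggregate_condition_results` is driven by the `condition_keys` option in the `general` section of the project configuration. A minimal sketch of that option and how the function parses it; the key names `codec` and `bitrate` are purely hypothetical:

```python
# Sketch of the config option read by aggregate_condition_results.
# The key names "codec" and "bitrate" are hypothetical examples of condition keys.
import configparser

cfg = configparser.ConfigParser()
cfg.read_string("""
[general]
condition_keys = codec, bitrate
""")

# The function splits the comma-separated list and strips whitespace.
keys = [k.strip() for k in cfg['general']['condition_keys'].split(',')]
print(keys)  # ['codec', 'bitrate']
```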
@@ -1900,6 +1930,10 @@ def analyze_results(config, test_method, answer_path, list_of_req, quality_bonus
    )
    write_dict_as_csv(all_data_per_worker, all_votes_per_file_path)

    # aggregated analysis report
    agg_path = os.path.splitext(answer_path)[0] + '_aggregated_results.csv'
    aggregate_condition_results(all_data_per_worker, config, agg_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
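
A self-contained usage sketch of the new report, assuming `src/` is on `PYTHONPATH` so that `result_parser` is importable; the vote entries and the `condition` key below are illustrative placeholders, not real study data:

```python
# Standalone sketch: call aggregate_condition_results on a tiny, made-up vote list.
import configparser
import pandas as pd

from result_parser import aggregate_condition_results  # assumes src/ is on PYTHONPATH

cfg = configparser.ConfigParser()
cfg.read_string("[general]\ncondition_keys = condition\n")

votes = [
    {'condition': 'c01', 'question_type': 'MOS', 'vote': 4},
    {'condition': 'c01', 'question_type': 'MOS', 'vote': 5},
    {'condition': 'c02', 'question_type': 'MOS', 'vote': 3},
    {'condition': 'c02', 'question_type': 'MOS', 'vote': 4},
]
aggregate_condition_results(votes, cfg, 'demo_aggregated_results.csv')
print(pd.read_csv('demo_aggregated_results.csv'))
```

Note that the 95%CI column uses a normal approximation, so any group with a single vote gets `NaN` there (the sample standard deviation is undefined for one observation).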