"""This script wraps together several other scripts to accomplish the following,
* Parse CES data to extract data on a set of policies
* Query the OpenAI API to generate chat responses that simulate artificial polling of demographics across those policies
* Analyze the resulting data, comparing the GPT results to the real people from the CES data
This script has a CLI. Use `python query_and_analyze_ces.py --help` for documentation.
See the definition of `QUERY_CONFIG_SCHEMA` for guidance on the json config file format and options.
"""
import json
import jsonschema
from pathlib import Path
from typing import Optional
import click
import pandas as pd
import seaborn as sns
from analysis import bar_plot, hist_plot, load_gpt_data, scatter_plot
from openai_api_query import _get_model_params, OUTPUT_DIR, PromptOptionsType, main as query_main
from parse_ces import CES_DEMOS, RetainPoliciesType, get_ces_data
from utils import START_TIME, invert_dict, setup_logging

LOGGER = setup_logging('LOGS/query_and_analyze_ces')

QUERY_CONFIG_SCHEMA = {
"type" : "object",
"properties": {
"BASE_PROMPT" : {
"type" : "string",
"description": "The prompt text for the GPT queries, with python "
"formattable string placeholders matching the demographic "
"fields from PROMPT_OPTIONS.",
},
"DEMOS": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of demos; must be a subset of `CES_DEMOS.keys()`"
},
"CES_POLICIES": {
"type": "object",
"description": "Policies to pull from the CES data and query GPT for.",
"additionalProperties": {
"type": "object",
"items": {
"type": "object",
"code": {
"type": "string",
"description": "Code number for a question, used for lookups in CES data."
},
"prompt": {
"type": "string",
"description": "Prompt text used in CES survey."
}
}
}
},
"QUERY_REPEATS": {
"type": "integer",
"description": "How many prompt completions to run for each demographic bin; by default 1."
},
"N_PER_QUERY": {
"type": "integer",
"description": "How many prompt completions to run per query; by default 10."
},
"PREVIOUS_QUERY_FILE": {
"type": "string",
"description": "Optional; if specified, load a previous GPT query output from this file "
"rather than re-querying. If this points to a partially-downloaded file, then any "
"prompts not yet queried for will be completed (it will pick up querying where it "
"left off)."
},
},
"REQUIRED": [
"BASE_PROMPT",
"DEMOS",
"CES_POLICIES"
]
}
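
# A minimal config matching QUERY_CONFIG_SCHEMA might look like the following.
# This is an illustrative sketch: the policy key, CES question code, and prompt
# wording are hypothetical, and the {issue}/{cardinality}/{low_level}/{high_level}
# placeholders assume the base prompt template uses the per-issue fields built by
# get_prompt_options below.
#
# {
#     "BASE_PROMPT": "You are a {ideology} {gender} voter. Rate this policy from 1 ({low_level}) to {cardinality} ({high_level}): {issue}",
#     "DEMOS": ["ideology", "gender"],
#     "CES_POLICIES": {
#         "assault_rifle_ban": {
#             "code": "CC22_330",
#             "prompt": "Ban assault rifles"
#         }
#     },
#     "N_PER_QUERY": 10,
#     "QUERY_REPEATS": 1
# }
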
def normalize_levels(source: str) -> str:
"""Normalize policy issue response levels from CES data by making certain transformations.
"""
source = source.strip().lower()
if source == 'selected':
output = 'strongly agree'
elif source == 'not selected':
output = 'strongly disagree'
else:
output = source
return output
IssueMapType = dict[str, str | dict[str, str]]

def get_prompt_options(demos: list[str], ces_df: pd.DataFrame, ces_policies: RetainPoliciesType
) -> tuple[PromptOptionsType, IssueMapType]:
"""Extract a set of prompt options from the demographic data of a CES DataFrame.
    Returns a tuple of two dicts: the first holds the prompt options for GPT querying
    (the demographic values, plus an 'issue' entry listing each policy's prompt text,
    cardinality, and low/high response levels); the second maps each issue's full
    prompt text back to its short name.
    """
prompt_options = {}
# Lookup demos
for demo in demos:
if ces_df[demo].dtype == 'category':
prompt_options[demo] = ces_df[demo].cat.categories.to_list()
else:
prompt_options[demo] = ces_df[demo].unique().tolist()
    # Map each issue's full prompt text to its short name
issue_map = {ces_policies[issue]['prompt']: issue for issue in ces_policies}
# Lookup response values
issues_dicts = []
for issue in ces_policies:
an_issue = {}
an_issue['issue'] = ces_policies[issue]['prompt']
an_issue['cardinality'] = len(ces_df[issue].cat.categories)
an_issue['low_level'] = normalize_levels(ces_df[issue].cat.categories[0])
an_issue['high_level'] = normalize_levels(ces_df[issue].cat.categories[-1])
issues_dicts.append(an_issue)
prompt_options['issue'] = issues_dicts
return prompt_options, issue_map
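
# Illustrative shape of the return values (the demo levels and policy shown are
# hypothetical):
#   prompt_options = {
#       'ideology': ['very liberal', ..., 'very conservative'],
#       'issue': [{'issue': 'Ban assault rifles', 'cardinality': 2,
#                  'low_level': 'strongly agree', 'high_level': 'strongly disagree'}],
#   }
#   issue_map = {'Ban assault rifles': 'assault_rifle_ban'}
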
def load_query_config(config_path: Path) -> tuple[str, list[str], RetainPoliciesType, int, int, Optional[str]]:
"""Parse the json config file and return the base_query, ces_demos, ces_policies, query_repeats, and previous_query_file
"""
with open(config_path, 'r') as f:
config = json.load(f)
jsonschema.validate(instance=config, schema=QUERY_CONFIG_SCHEMA)
for key in config['DEMOS']:
        assert key in CES_DEMOS, f"You requested demo {key} in `DEMOS`, but it doesn't exist in the CES data."
        assert f'{{{key}}}' in config['BASE_PROMPT'], f"You requested demo {key}, but it has no placeholder in the base prompt."
if 'PREVIOUS_QUERY_FILE' not in config:
config['PREVIOUS_QUERY_FILE'] = None
else:
config['PREVIOUS_QUERY_FILE'] = Path(config['PREVIOUS_QUERY_FILE'])
if 'N_PER_QUERY' not in config:
config["N_PER_QUERY"] = 10
if 'QUERY_REPEATS' not in config:
config["QUERY_REPEATS"] = 1
return (config['BASE_PROMPT'], config['DEMOS'], config['CES_POLICIES'], config['N_PER_QUERY'], config['QUERY_REPEATS'],
config['PREVIOUS_QUERY_FILE'])

def get_data(
base_prompt: str,
demos: list[str],
ces_policies: RetainPoliciesType,
n_per_query: int,
query_repeats: int,
previous_query_file: Optional[Path | str]
) -> tuple[Path, pd.DataFrame, pd.DataFrame, IssueMapType, dict[str, str], PromptOptionsType]:
    """Load CES and AI polling data, either from API queries or a previously saved file.

    Returns the query output file path, the CES and GPT DataFrames, the issue map and
    its inverse, and the prompt options.
    """
LOGGER.info('Load the data')
ces_df = get_ces_data(demos=demos, policies=ces_policies)
prompt_options, issue_map = get_prompt_options(demos, ces_df, ces_policies)
issue_map_invert = invert_dict(issue_map)
outfile, _, _ = query_main(base_prompt, prompt_options, _get_model_params(n_per_query), query_repeats=query_repeats, previous_query_file=previous_query_file)
gpt_df = load_gpt_data(outfile)
return outfile, ces_df, gpt_df, issue_map, issue_map_invert, prompt_options

def transform_data(gpt_df: pd.DataFrame, ces_df: pd.DataFrame, demos: list[str], issue_map: IssueMapType, prompt_options: PromptOptionsType
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, dict[str, list[str]], dict[str, list[str]]]:
"""Transform loaded polling data to prepare for plotting.
"""
LOGGER.info('Transform the data')
gpt_df_plot = gpt_df.copy()
gpt_df_plot['issue'] = gpt_df_plot['issue'].map(issue_map)
ces_df_plot = ces_df.copy()
for issue in issue_map.values():
# Note - we add 1 to the codes so they start at 1, consistent with the prompt template
ces_df_plot[issue] = ces_df_plot[issue].cat.codes.values + 1
ces_df_plot = pd.melt(ces_df_plot, id_vars=demos, value_vars=list(issue_map.values()),
var_name='issue', value_name='score')
demos_options = {
demo: prompt_options[demo] for demo in prompt_options
if demo not in ['issue', 'cardinality', 'low_level', 'high_level']
}
combined_df = pd.concat({'gpt': gpt_df_plot, 'ces': ces_df_plot}, names=['dataset']).reset_index()
# We want to put 'dataset' first so it will show up as the bottom-level comparator in bar charts
    demos_options_with_dataset = {'dataset': ['gpt', 'ces'], **demos_options}
return ces_df_plot, gpt_df_plot, combined_df, demos_options, demos_options_with_dataset
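
# After transform_data, both datasets share one long-format row per response; the
# columns are roughly (the exact demo columns depend on the config):
#   dataset | <demo columns...> | issue | score
# This is the shape the bar_plot/scatter_plot/hist_plot calls below consume.
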
@click.command()
@click.option(
'-c', '--query_config',
required=True,
    type=click.Path(exists=True, path_type=Path),
help='Path to a json config file containing the query config matching QUERY_CONFIG_SCHEMA',
prompt=True)
def main(query_config: Path):
"""Parse CES data, query
"""
LOGGER.info('Begin query_and_analyze')
base_prompt, demos, ces_policies, n_per_query, query_repeats, previous_query_file = load_query_config(query_config)
outfile, ces_df, gpt_df, issue_map, issue_map_invert, prompt_options = get_data(
base_prompt, demos, ces_policies, n_per_query, query_repeats, previous_query_file)
out_csv = (OUTPUT_DIR / outfile.name).with_suffix('.csv.gz')
gpt_df.to_csv(out_csv)
ces_df_plot, gpt_df_plot, combined_df, demos_options, demos_options_with_dataset = transform_data(
gpt_df, ces_df, demos, issue_map, prompt_options)
LOGGER.info('Make the plots')
for issue in issue_map.values():
LOGGER.info(f'Plot the datasets alone for {issue}')
bar_plot(gpt_df_plot[gpt_df_plot['issue'] == issue], demos=demos_options, filename_prefix=f'gpt_{issue}',
title=issue_map_invert[issue], plot_func=sns.pointplot, timestamp=START_TIME)
bar_plot(ces_df_plot[ces_df_plot['issue'] == issue], demos=demos_options, filename_prefix=f'ces_{issue}',
title=issue_map_invert[issue], plot_func=sns.pointplot, timestamp=START_TIME)
LOGGER.info(f'Plot the datasets together for {issue}')
bar_plot(combined_df[combined_df['issue'] == issue], demos=demos_options_with_dataset, filename_prefix=f'combined_{issue}',
title=issue_map_invert[issue], plot_func=sns.pointplot, timestamp=START_TIME)
scatter_plot(combined_df, group_col='dataset', x_group='ces', y_group='gpt', val_col='score', filename_prefix='combined',
hue_col='issue', cats=list(demos_options.keys()), timestamp=START_TIME)
for issue in issue_map.values():
scatter_plot(combined_df[combined_df['issue'] == issue], group_col='dataset', x_group='ces', y_group='gpt', val_col='score',
filename_prefix=f'combined_ALL_{issue}', hue_col='issue', cats=list(demos_options.keys()), all_only=True,
timestamp=START_TIME)
for cat in list(demos_options.keys()):
scatter_plot(combined_df, group_col='dataset', x_group='ces', y_group='gpt', val_col='score', filename_prefix=f'combined_{cat}',
hue_col='issue', cats=[cat], timestamp=START_TIME)
for issue in issue_map.values():
scatter_plot(combined_df[combined_df['issue'] == issue], group_col='dataset', x_group='ces', y_group='gpt', val_col='score',
filename_prefix=f'combined_ALL_{issue}_{cat}', hue_col='issue', cats=[cat], all_only=True, timestamp=START_TIME)
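    # Note: the histogram facets below assume 'ideology' and 'gender' are among
    # the configured demos; a config without them will fail here.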
hist_plot(combined_df, row='ideology', col='issue', filename_prefix='byissue_byideology', row_order=demos_options['ideology'],
timestamp=START_TIME)
hist_plot(combined_df, row='gender', col='issue', filename_prefix='byissue_bygender', timestamp=START_TIME)
for issue in issue_map.values():
hist_plot(combined_df[combined_df['issue'] == issue], row=None, col='ideology', filename_prefix=issue,
col_order=demos_options['ideology'], timestamp=START_TIME)
LOGGER.info('End query_and_analyze')

if __name__ == '__main__':
main()