-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun.py
More file actions
executable file
·148 lines (115 loc) · 5.08 KB
/
run.py
File metadata and controls
executable file
·148 lines (115 loc) · 5.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
import yaml
import json
import argparse
from pathlib import Path
from longguide import MetricsGuidelines, OutputConstraintsGuidelines
from longguide.llm_client import LLMClient
from standardize_data import standardize_dataset
def get_task_instruction(data_path):
    """Return the task instruction string for the dataset at *data_path*.

    The dataset is identified by the final path component; unknown datasets
    fall back to a generic instruction.
    """
    instruction_by_dataset = {
        "SWiPE": "Simplify this text.",
        "SAMSum": "Summarize the following dialogue.",
        "CNN": "Summarize the following news.",
        "xlsum": "Summarize the following document.",
        "IWSLT": "Translate the following from English to Japanese.",
        "CommonGen": "Generate the text from the following table.",
        "SyntheticDialogue": "Generate the next dialogue response."
    }
    dataset = Path(data_path).name
    return instruction_by_dataset.get(dataset, "Process this text.")
def get_data_path(task_type):
    """Resolve a task type to the directory holding its dataset.

    Supported task types: summarization, translation, dialogue generation,
    table-to-text generation, text simplification. Unrecognized task types
    default to the SAMSum summarization data.
    """
    dataset_by_task = {
        "summarization": "data/SAMSum", # Can be CNN/xlsum/SAMSum
        "translation": "data/IWSLT",
        "text simplification": "data/SWiPE",
        "table-to-text generation": "data/CommonGen",
        "dialogue generation": "data/SyntheticDialogue",
    }
    if task_type in dataset_by_task:
        return dataset_by_task[task_type]
    return "data/SAMSum"
def load_config(config_path):
    """Read the YAML file at *config_path* and return its parsed contents."""
    raw_text = Path(config_path).read_text()
    return yaml.safe_load(raw_text)
def load_dataset(data_path):
    """Load and standardize dataset from JSON files.

    Reads every ``*.json`` file under *data_path*, concatenates their
    contents (each file is assumed to hold a JSON list — TODO confirm),
    and passes the merged data through ``standardize_dataset``.

    Returns the standardized examples.
    """
    data_dir = Path(data_path)
    dataset_name = data_dir.name
    data = []
    for json_file in data_dir.glob("*.json"):
        with open(json_file, 'r', encoding='utf-8') as f:
            file_data = json.load(f)
        data.extend(file_data)
    # standardize_dataset expects a file path, so write a temp file;
    # the try/finally guarantees cleanup even if standardization fails.
    temp_file = Path(f"temp_{dataset_name}.json")
    try:
        with open(temp_file, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        standardized_data = standardize_dataset(dataset_name, str(temp_file))
    finally:
        temp_file.unlink(missing_ok=True)
    return standardized_data
def run_longguide(config_path):
    """Run LongGuide with specified configuration.

    Loads the config, resolves and loads the matching dataset, builds
    metric/constraint guidelines, generates three prompt variants per
    example (full attributes, metrics only, constraints only), and saves
    all generations to ``outputs/results_<task_type>.json``.
    """
    config = load_config(config_path)
    # Auto-determine data path from task type
    data_path = get_data_path(config['task_type'])
    # Load dataset
    dataset = load_dataset(data_path)
    print(f"Loaded {len(dataset)} examples from {data_path}")
    # Initialize guidelines with task type and config
    metrics_guidelines = MetricsGuidelines(config['task_type'], config)
    constraints_guidelines = OutputConstraintsGuidelines(config['task_type'], config)
    # Initialize LLM client for generation
    llm_client = LLMClient(config['model_name'], config['api_key'])
    # Get task-specific guidelines
    metrics = metrics_guidelines.get_guidelines()
    constraints = constraints_guidelines.get_guidelines(dataset[:10]) # Test with first 10 examples
    print(f"Using guidelines for task: {config['task_type']}")
    print(f"Metrics: {metrics}")
    print(f"Constraints: {constraints}")
    # Task instruction depends only on the dataset, so hoist it out of the loop.
    task_instruction = get_task_instruction(data_path)
    # Process dataset
    results = []
    for i, item in enumerate(dataset):
        print(f"Processing example {i+1}/{len(dataset)}")
        input_text = item['input']
        # Full attributes prompt
        full_prompt = f"""{task_instruction} Your generated output must strictly fulfill the following task metrics. {constraints}
{metrics}
Input: {input_text}"""
        # Only metrics prompt
        only_metrics_prompt = f"""{task_instruction} Your generated output must strictly fulfill the following task metrics.
{metrics}
Input: {input_text}"""
        # Only constraints prompt
        only_constraints_prompt = f"""{task_instruction} {constraints}
Input: {input_text}"""
        # Generate outputs using LLM
        full_attributes_output = llm_client.generate(full_prompt)
        only_metrics_output = llm_client.generate(only_metrics_prompt)
        only_constraints_output = llm_client.generate(only_constraints_prompt)
        results.append({
            'input': input_text,
            'target': item['output'],
            'full_attributes_prompt': full_prompt,
            'only_metrics_prompt': only_metrics_prompt,
            'only_constraints_prompt': only_constraints_prompt,
            'full_attributes': full_attributes_output,
            'only_metrics': only_metrics_output,
            'only_constraints': only_constraints_output
        })
    # Save results — ensure the output directory exists first so a fresh
    # checkout doesn't crash with FileNotFoundError.
    Path("outputs").mkdir(parents=True, exist_ok=True)
    output_path = f"outputs/results_{config['task_type']}.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {output_path}")
def main():
    """CLI entry point: parse the --config argument and launch LongGuide."""
    arg_parser = argparse.ArgumentParser(description='Run LongGuide with configuration')
    arg_parser.add_argument(
        '--config',
        default='configs/default.yaml',
        help='Path to configuration file',
    )
    cli_args = arg_parser.parse_args()
    run_longguide(cli_args.config)


if __name__ == "__main__":
    main()