datahelp/excel2json.py at main · MoserMichael/datahelp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import pandas as pd
import argparse
import sys
import re
import json
import math

def parse_cmd_line():
    """Parse the command line of the excel-to-json converter.

    ``--row_from`` and ``--col_from`` are optional and default to the first
    row / first column, so the short invocation shown in the help text
    (only ``--excel`` and ``--json``) is accepted.

    Returns:
        A ``(namespace, parser)`` tuple. The namespace carries the
        attributes ``excel_file``, ``excel_tab``, ``out_file``, ``row_from``
        (int), ``col_from`` (str, digits or excel letters) and
        ``use_columns`` (comma delimited string, empty when not given).
    """
    usage = '''
Extract a table from an excel file into a json file.
The json file is an array of records, where each row stands for a record in the json.
The row-record consists of name-value pairs, where the name is the table header and the value is the cell value for this row.

The initial position is the start of the header line.
All data lines are taken, until end of excel, or until the first line that does not have any values

Example:

python3 excel2json.py --excel=tbl.xlsx --col_from=3 --row_from=1 --json=out.json

Extract the table where the header line starts from column 3 (one is the first column) and row 1 (one is the first row)

python3 excel2json.py --excel=tbl.xlsx  --json=out.json

Extract the table where the header line starts at the first column of the first row.

'''

    # The raw formatter keeps the line breaks of the description above;
    # the default formatter would reflow it into a single paragraph.
    parse = argparse.ArgumentParser(
        description=usage,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parse.add_argument('--excel',
                       '-i',
                       required=True,
                       type=str,
                       dest='excel_file',
                       help='file name of excel input file')

    parse.add_argument('--tab',
                       '-t',
                       type=str,
                       dest='excel_tab',
                       help='tab name of excel tab')

    parse.add_argument('--json',
                       '-o',
                       required=True,
                       type=str,
                       dest='out_file',
                       help='output json file')

    parse.add_argument('--row_from',
                       '-x',
                       required=False,
                       default=1,
                       type=int,
                       dest='row_from',
                       help='starting row of range (one based, default: 1)')

    # Kept as a string here: the value may be a number or excel letters.
    parse.add_argument('--col_from',
                       '-y',
                       required=False,
                       default="1",
                       type=str,
                       dest='col_from',
                       help='starting column of range (one based or excel character notation, default: 1)')

    parse.add_argument('--filter',
                       '-f',
                       required=False,
                       default="",
                       type=str,
                       dest='use_columns',
                       help='filter a subset of column (comma delimited list of column names)')

    return parse.parse_args(), parse

def err(msg):
    """Report a fatal error and terminate the program.

    The message is written to standard error so that it does not mix with
    the regular output of the tool.

    Args:
        msg: text of the error, printed after the ``Error: `` prefix.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print(f"Error: {msg}", file=sys.stderr)
    sys.exit(1)

def excel_num_to_idx(arg):
    """Convert an excel column name to its one-based column number.

    The name is read as a base-26 number whose digits are the letters
    ``a`` to ``z`` (case insensitive): ``A`` is 1, ``Z`` is 26, ``AA`` is 27.

    Args:
        arg: column name; only the ASCII letters a-z / A-Z are accepted.

    Returns:
        The one-based column number; 0 for an empty string.

    Any other character is reported through ``err``, which ends the program.
    """
    res = 0
    for ch in arg.lower():
        # Explicit range check: str.isalpha() is also true for non-ASCII
        # letters, which would produce a meaningless column number.
        if 'a' <= ch <= 'z':
            res = res * 26 + (ord(ch) - ord('a') + 1)
        else:
            err(f"parameter {arg} should be all letters (excel convention for column position) or all digits (one based offset)")
    return res


def check_vals(arg):
    """Validate the parsed command line values and normalise them.

    ``arg.col_from`` is replaced in place by its integer value: a string of
    decimal digits is converted with ``int``, anything else is handed to
    ``excel_num_to_idx``. Values smaller than one for ``col_from`` or
    ``row_from`` are reported through ``err``, which ends the program.

    Args:
        arg: namespace with the attributes ``col_from`` (str), ``row_from``
            (int) and ``use_columns`` (comma delimited string).

    Returns:
        The list of column names given in ``arg.use_columns``, each stripped
        of surrounding blanks, with empty entries dropped; an empty list
        means that no filter was requested.
    """
    # isdecimal() rather than isdigit(): the latter is also true for
    # characters such as superscript digits that int() refuses to parse.
    if arg.col_from.isdecimal():
        arg.col_from = int(arg.col_from)
    else:
        arg.col_from = excel_num_to_idx(arg.col_from)

    if arg.col_from <= 0:
        err("positive (greater equal to one) value for --col_from expected")

    if arg.row_from <= 0:
        err("positive (greater equal to one) value for --row_from expected")

    # Empty entries (for example from a trailing comma) can never name a
    # column, so they are left out.
    names = (name.strip() for name in arg.use_columns.split(","))
    return [name for name in names if name]

def parse_header(df, y, x):
    """Read the header names of the table from one row of the data frame.

    Cells are taken from row ``y``, starting at column ``x`` and moving
    right, until an empty cell or the last column is reached.

    Args:
        df: data frame holding the sheet.
        y: zero-based row index of the header line.
        x: zero-based column index of the first header cell.

    Returns:
        List of header names. Text cells are stripped of surrounding blanks,
        numbers are rendered with ``str``; blanks inside a name are replaced
        by ``-``. The list is empty when the start position lies outside of
        the data frame or the first cell is empty.
    """
    out_header = []

    num_rows, num_columns = df.shape

    # A header row past the end of the sheet has no cells to offer.
    if y >= num_rows:
        return out_header

    for x_cur in range(x, num_columns):
        cell = df.iat[y, x_cur]

        s_val = ""
        if isinstance(cell, str):
            s_val = cell.strip()
        elif isinstance(cell, int) or pd.api.types.is_integer(cell):
            # is_integer also recognises numpy integers, which are not
            # instances of the built-in int.
            s_val = str(cell)
        elif isinstance(cell, float):
            # NaN and infinity count as an empty cell.
            if math.isfinite(cell):
                s_val = str(cell)

        if s_val == "":
            break

        out_header.append(s_val.replace(' ', '-'))

    return out_header

def _cell_to_text(cell):
    """Render one cell as text; an empty string stands for 'no value'."""
    if isinstance(cell, str):
        return cell
    if isinstance(cell, int) or pd.api.types.is_integer(cell):
        # is_integer also recognises numpy integers, which are not
        # instances of the built-in int.
        return str(cell)
    if isinstance(cell, float):
        # NaN and infinity count as an empty cell.
        return str(cell) if math.isfinite(cell) else ""
    # NOTE(review): every other cell type is rendered as empty; if the
    # reader delivers date cells as datetime objects they are dropped here -
    # confirm whether that is intended.
    return ""


def process(arg, prs):
    """Convert the table of the excel file into a json file.

    The header line is located at ``arg.row_from`` / ``arg.col_from`` (both
    one based). Every following row becomes one record mapping header name
    to cell text; reading stops at the end of the sheet or at the first row
    whose selected cells are all empty. The records are written to
    ``arg.out_file`` as a json array.

    Args:
        arg: namespace produced by ``parse_cmd_line``.
        prs: the argument parser; not used here.
    """
    filter_columns = check_vals(arg)

    # Header names have their blanks replaced by '-'; do the same for the
    # requested names so that a filter given with blanks still matches.
    wanted = {name.replace(' ', '-') for name in filter_columns}

    # Sheet 0 is the first sheet, which is what read_excel uses by default.
    sheet = arg.excel_tab if arg.excel_tab is not None else 0
    df = pd.read_excel(arg.excel_file, sheet_name=sheet, header=None, keep_default_na=False)

    header_names = parse_header(df, arg.row_from-1, arg.col_from-1)
    print(f"table headers: {header_names}")

    # Resolve once which columns go into the output.
    x = arg.col_from - 1
    selected = [(x + offset, name)
                for offset, name in enumerate(header_names)
                if not wanted or name in wanted]

    json_data = []
    for y in range(arg.row_from, df.shape[0]):
        row_values = [(name, _cell_to_text(df.iat[y, col])) for col, name in selected]

        # The first row without any value marks the end of the table.
        if all(text == "" for _, text in row_values):
            break

        json_data.append(dict(row_values))

    # save it
    out_data = json.dumps(json_data, indent=4)
    with open(arg.out_file, 'w', encoding='utf-8') as out_file:
        out_file.write(out_data)

def main():
    """Script entry point: parse the command line and run the conversion."""
    arg, prs = parse_cmd_line()
    process(arg, prs)


# Run only when executed as a script, so that importing this module does
# not start the conversion.
if __name__ == "__main__":
    main()