This repository has been archived by the owner on Dec 2, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextended_csv.py
134 lines (113 loc) · 4.1 KB
/
extended_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import csv
from itertools import chain, islice
from pathlib import Path
from typing import IO, Dict, Iterable, List, Union

from utils.dict import select_not_null
# Names exported by `from <this module> import *`.
__all__ = ['read_xsv', 'read_xsv_file', 'get_dialect_from_suffix']
# Maps a lower-case file suffix (with or without the leading dot) to the name
# of a dialect registered in the built-in `csv` module.
_FORMAT_DIALECT = {
    'csv': 'excel',
    'tsv': 'excel-tab',
    '.csv': 'excel',
    '.tsv': 'excel-tab',
}


def get_dialect_from_suffix(suffix: str) -> str:
    """Returns `csv` module dialect given file ending.

    The lookup is case-insensitive, so suffixes taken straight from a path
    such as ``DATA.CSV`` are recognized as well.

    Args:
        suffix:
            File ending, with or without the leading dot
            (e.g. ``'.csv'`` or ``'csv'``).

    Returns:
        The name of the matching `csv` dialect.

    Raises:
        ValueError: If the suffix is not a known file format.

    Examples:
        >>> get_dialect_from_suffix('.csv')
        'excel'
        >>> get_dialect_from_suffix('.tsv')
        'excel-tab'
        >>> get_dialect_from_suffix('csv')
        'excel'
        >>> get_dialect_from_suffix('tsv')
        'excel-tab'
        >>> get_dialect_from_suffix('.CSV')
        'excel'
    """
    try:
        return _FORMAT_DIALECT[suffix.lower()]
    except (AttributeError, KeyError):
        # AttributeError: a non-string (e.g. None) was passed; report it the
        # same way as any other unknown format instead of leaking the
        # implementation detail.
        raise ValueError(f"Unrecognized file format: '{suffix}'") from None
def read_xsv_file(filename: Union[str, Path],
                  dialect: str,
                  *,
                  encoding: str = None,
                  fieldnames: List[str] = None,
                  first_line_is_column_header: bool = True,
                  discard: int = None,
                  load_at_most: int = None,
                  ) -> List[Dict]:
    """Returns a list of dicts. Convenience method for `read_xsv`.

    Opens the file, delegates parsing to `read_xsv` and materializes the
    rows before the file is closed.

    Args:
        filename:
            The filename to open.
        dialect:
            As used in built-in module `csv`.
        encoding:
            Encoding of the file to open. If None, the default of the
            built-in `open` is used.
        fieldnames:
            Column names, forwarded to `read_xsv`. `read_xsv` rejects them
            with NotImplementedError when `first_line_is_column_header`
            is True.
        first_line_is_column_header:
            If True, parses first line as column headers.
        discard:
            Non-negative integer or None. Initial rows of _data_ to discard.
        load_at_most:
            Non-negative integer or None. Rows of _data_ to load.

    Returns:
        All selected rows, one dict per row.

    Notes:
        Use 'excel' dialect for CSV. Use 'excel-tab' for TSV.
    """
    open_kwargs = {'encoding': encoding}
    reader_kwargs = {
        'fieldnames': fieldnames,
        'first_line_is_column_header': first_line_is_column_header,
        'discard': discard,
        'load_at_most': load_at_most,
    }
    # newline='' is what the `csv` module documentation asks for: it lets the
    # csv parser see the original line endings, so newlines embedded in
    # quoted fields are not altered by universal-newline translation.
    # select_not_null presumably drops None-valued entries so that the
    # callee's own defaults apply -- verify against utils.dict.
    with open(filename, 'r', newline='',
              **select_not_null(open_kwargs, 'encoding')) as file:
        # must be iterated now because the file will be closed on exit
        return list(read_xsv(file, dialect, **select_not_null(reader_kwargs)))
def _peek_first_row(file, reader_options):
    """Parses the first non-empty record of `file` without losing it.

    Returns a tuple ``(first_row, lines)``: `first_row` is the parsed record
    (an empty list if the input has no records) and `lines` is an iterable
    that yields every line of the input again, including the ones consumed
    while peeking. No seeking is needed, so this also works on streams that
    are not seekable.
    """
    consumed = []

    def recording_lines():
        # Remember every line handed to the csv parser so it can be replayed.
        for line in file:
            consumed.append(line)
            yield line

    probe = csv.reader(recording_lines(), **reader_options)
    # Skip empty records: csv.DictReader skips them too, so the first
    # non-empty record is the one that determines the number of columns.
    first_row = next((row for row in probe if row), [])
    return first_row, chain(consumed, file)


def read_xsv(file: IO,
             dialect: str,
             fieldnames: List[str] = None,
             first_line_is_column_header: bool = True,
             discard: int = None,
             load_at_most: int = None,
             ) -> Iterable[Dict]:
    """Returns an iterable of dict. Must be iterated while file is still open.

    Args:
        file:
            An open text file (or any iterable of lines).
        dialect:
            As used in built-in module `csv`.
        fieldnames:
            Column names to use as dict keys. Only allowed when
            `first_line_is_column_header` is False. If omitted in that case,
            columns are named 'Column 1', 'Column 2', ... based on the
            number of fields in the first record.
        first_line_is_column_header:
            If True, parses first line as column headers.
        discard:
            Non-negative integer or None. Initial rows of _data_ to discard.
        load_at_most:
            Non-negative integer or None. Rows of _data_ to load.

    Returns:
        A lazy iterable with one dict per selected row.

    Raises:
        NotImplementedError: If `fieldnames` is given although the first
            line is a column header.

    Notes:
        Use 'excel' dialect for CSV. Use 'excel-tab' for TSV.

    Warnings:
        Must be iterated while file is still open.
    """
    # Validate before touching the file so a bad call consumes no input.
    if first_line_is_column_header and fieldnames is not None:
        raise NotImplementedError("Changing column names isn't supported for simplicity")

    # A None dialect is left out so that the `csv` module default applies.
    reader_options = {} if dialect is None else {'dialect': dialect}

    lines = file
    if not first_line_is_column_header and fieldnames is None:
        # use 'Column X' as fieldnames like in OpenRefine
        # The first record is parsed with the csv module itself so quoted
        # delimiters and quoted line breaks are counted correctly.
        first_row, lines = _peek_first_row(file, reader_options)
        fieldnames = [f'Column {i + 1}' for i in range(len(first_row))]

    reader = csv.DictReader(lines, fieldnames=fieldnames, **reader_options)

    stop = None
    if load_at_most is not None:
        # islice takes an absolute stop index, hence the offset by `discard`.
        stop = load_at_most + (discard or 0)
    return islice(reader, discard, stop)