Preprocess the output received from the server and present it as a final result to the client
"""
44import os
5+ import re
56import tempfile
67import warnings
78import collections
9+ from statistics import mode
10+ from typing import List
811
912import pandas as pd
1013
1114
class ConvertTo:
    """Convert tabular JSON from the server response into a client-requested output format."""

    # Output formats the client may request
    FORMATS = {"df", "dataframe", "json", "csv", "dict", "xlsx", "excel"}
    DEFAULT = "df"

    def __init__(self, server_response: dict, output_format: str = DEFAULT, indexing: bool = False, table_obj="TableJson"):
        """
        Convert the server response to a user-requested output format on Tables.

        :param server_response: tabular JSON data from the server
        :param output_format: format to be converted into (one of ``FORMATS``)
        :param indexing: row & column index consideration in the output
        :param table_obj: key under each table entry holding the cell data
            (NOTE(review): inferred from its use in ``_converter`` — confirm)
        """
        self.server_response = server_response
        self.output = self._converter(output_format.lower(), indexing=indexing, table_obj=table_obj)
2628
27- def _converter (self , fmt : str , indexing : bool = False ) -> list :
29+ def _converter (self , fmt : str , indexing : bool = False , table_obj = "TableJson" ) -> list :
2830 """
2931 Actual conversion takes place here using Pandas
3032 :param fmt: format to be converted into
3133 :param indexing: row index consideration in the output
3234 :return: list of tables from converted into the requested output format
3335 """
3436 dfs = []
35- for table in self .data .get ("Tables" , []):
36- tmp = {int (k ): v for k , v in table ["TableJson" ].items ()}
37+ for table in self .server_response .get ("Tables" , []):
38+ tmp = {int (k ): v for k , v in table [table_obj ].items ()}
3739 # To convert column indices to int to maintain the table order with more than 9 columns
38- cols = [str (x ) for x in sorted ([int (x ) for x in tmp [0 ]])]
40+ cols = [str (x ) for x in sorted ([int (x ) for x in tmp [0 ]])] if tmp else None
3941 # To convert row indices to int and maintain the table order with more than 9 rows
4042 tmp = collections .OrderedDict (sorted (tmp .items ()))
4143 dfs .append (pd .DataFrame .from_dict (tmp , orient = "index" , columns = cols ))
@@ -52,9 +54,196 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
5254 df .to_csv (csv_name , index = indexing , header = indexing )
5355 output_location .append (csv_name )
5456 return output_location
57+ elif fmt in ("xlsx" , "excel" ):
58+ output_excel_location = os .path .join (tempfile .mkdtemp (), f"_tables_{ len (dfs )} .xlsx" )
59+ if len (dfs ) >= 10 :
60+ warnings .warn (f"There are { dfs } tables extracted. Consider to change the output_format to 'csv' instead" )
61+ with pd .ExcelWriter (output_excel_location ) as writer :
62+ for n , df in enumerate (dfs ):
63+ df .to_excel (writer , f'table_{ n + 1 } ' )
64+ writer .save ()
65+ return [output_excel_location ]
5566 elif fmt == "json" :
5667 return [df .to_json () for df in dfs ]
5768 else :
5869 warn_msg = f"Supported output formats { self .FORMATS } only. Assigned to default: { self .DEFAULT } "
5970 warnings .warn (warn_msg )
6071 return dfs
72+
73+
class MakeCorrections:
    """Post-processing corrections for tables extracted by ExtractTable."""

    def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
        """
        To apply post processing techniques on the output.

        :param et_resp: ExtractTable response (JSON dict from the server)
        :param dataframes: user preferred dataframe(s).
            Default assumes all dataframes from the ExtractTable response, `et_resp`.
            If both `et_resp` and `dataframes` are provided, the latter is considered
            for the processing.
        :raises ValueError: when neither a server response nor valid dataframes are given
        """
        if dataframes is not None:
            # Documented precedence: explicit dataframes win over the raw response
            try:
                self.dataframes = self.__isacceptable__(dataframes)
            except ValueError:
                raise ValueError("Either ExtractTable response or your preferred list of pandas dataframes is required")
        elif et_resp:
            # BUGFIX: ConvertTo's keyword parameter is `server_response`, not `data`
            self.dataframes = ConvertTo(server_response=et_resp).output
        else:
            raise ValueError("Either ExtractTable response or your preferred list of pandas dataframes is required")

    @staticmethod
    def __isacceptable__(dfs) -> List[pd.DataFrame]:
        """Validate the `dataframes` param; always return a list of DataFrames."""
        if isinstance(dfs, list):
            if all(isinstance(df, pd.DataFrame) for df in dfs):
                return dfs
        elif isinstance(dfs, pd.DataFrame):
            return [dfs]
        raise ValueError("Dataframes should be list of dataframes or a dataframe")

    def split_merged_rows(self) -> List[pd.DataFrame]:
        """
        To split the merged rows into possible multiple rows.

        A cell holding several merged rows is assumed to separate them with " ".
        :return: reformatted list of dataframes
        """
        for df_idx, each_df in enumerate(self.dataframes):
            reformat = []
            for row in each_df.to_numpy():
                row = list(row)

                # Separator count per cell; the statistical mode across the row
                # estimates how many rows were merged together
                separators = [col.strip().count(" ") for col in row]
                mode_ = mode(separators)

                if mode_:
                    # Split each cell into at most (mode_ + 1) parts and
                    # re-assemble the parts column-wise into new rows;
                    # cells with fewer parts are padded with empty strings
                    parts = [col.strip().split(' ', mode_) for col in row]
                    for part_idx in range(len(parts[0])):
                        new_row = []
                        for cell in parts:
                            try:
                                new_row.append(cell[part_idx])
                            except IndexError:
                                new_row.append("")
                        reformat.append(new_row)
                else:
                    reformat.append(row)

            self.dataframes[df_idx] = pd.DataFrame(reformat)

        return self.dataframes

    def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool = False) -> List[pd.DataFrame]:
        """
        To split the merged columns into possible multiple columns.

        :param columns_idx: user preferred columns indices.
            Default loops through all columns to find numeric or decimal columns
        :param force_split: To force split through the columns
        :return: reformatted list of dataframes
        """
        # TODO: Should we consider delimiter_pattern for the split?
        for df_idx, df in enumerate(self.dataframes):
            # BUGFIX: resolve target columns per dataframe; previously the first
            # dataframe's columns leaked into every subsequent iteration
            target_cols = [str(x) for x in (columns_idx if columns_idx else df.columns)]

            reformat = []
            for col_idx in target_cols:
                tmp = df[col_idx].str.split(expand=True)

                # Proceed with the split only when the user forces it or when the
                # split produced a value for every cell (no NaN holes); a
                # single-column split is a no-op either way
                split_is_clean = not any(tmp.isna().any())
                if (not (split_is_clean or force_split)) or tmp.shape[-1] == 1:
                    reformat.append(df[col_idx].tolist())
                else:
                    reformat.extend([tmp[each].tolist() for each in tmp.columns])

            self.dataframes[df_idx] = pd.DataFrame(reformat).T

        return self.dataframes

    def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: str = ".", thousands_separator: str = ",", decimal_position: int = 2) -> List[pd.DataFrame]:
        """
        To fix decimal and thousands separator values. Often commas are detected as periods.

        :param columns_idx: user preferred columns indices.
            Default loops through all columns to find numeric or decimal columns
        :param decimal_separator: preferred decimal separator
        :param thousands_separator: preferred thousands separator
        :param decimal_position: preferred decimal position
        :return: corrected list of dataframes
        """
        # TODO: Should we consider only bad confidence values?
        # A separator character followed by another separator later in the
        # string must be a thousands separator
        reg_ = f"[{decimal_separator}{thousands_separator}]"
        if decimal_position > 0:
            thou_regex = reg_ + '(?=.*' + reg_ + ')'
        else:
            thou_regex = reg_
        decimal_position = int(decimal_position)

        for df_idx, df in enumerate(self.dataframes):
            # BUGFIX: per-dataframe column resolution (no leakage across dataframes)
            target_cols = [str(x) for x in (columns_idx if columns_idx else df.columns)]

            for col_idx in target_cols:
                digits = df[col_idx].str.count(pat=r'\d').sum()
                chars = df[col_idx].str.count(pat=r'[\w]').sum()

                # To infer a numeric or float column, require >= 75% digits.
                # BUGFIX: guard against ZeroDivisionError on empty/symbol-only columns
                if not chars or digits / chars < 0.75:
                    continue

                df[col_idx] = df[col_idx].str.strip()
                # BUGFIX: assign the result back instead of `inplace=True` on a
                # column slice, which is unreliable under chained assignment
                df[col_idx] = df[col_idx].replace(regex={thou_regex: thousands_separator})

                # To correct decimal position
                if not decimal_position > 0:
                    continue

                for i, _ in enumerate(df[col_idx]):
                    # NOTE(review): positional `i` is used as an index label here,
                    # which assumes a default RangeIndex — confirm for custom indices
                    value = df[col_idx][i]
                    if not len(value) > decimal_position:
                        # needs a length of at least decimal_position
                        continue
                    elif value[-(decimal_position + 1)] == decimal_separator:
                        # nothing to do if the decimal separator is already in place
                        continue

                    # If the character at the decimal position is non-alphanumeric
                    # and the value is digit-dominant, force the decimal separator in
                    if re.search(r'\W+', value[-(decimal_position + 1)]):
                        digit_count = len(re.findall(r'\d', value))
                        if digit_count / len(value) >= 0.5:
                            df[col_idx][i] = value[:-(decimal_position + 1)] + decimal_separator + value[-decimal_position:]

            self.dataframes[df_idx] = df
        return self.dataframes

    def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
        """
        To fix date formats of the column.
        Eg: 12|12|2020 as 12/12/2020

        :param columns_idx: user preferred columns indices.
            Default loops through all columns to find date columns
        :param delimiter: "/" or "-" or whatever else you prefer
        :return: corrected list of dataframes
        """
        date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
        for df_idx, df in enumerate(self.dataframes):
            # BUGFIX: per-dataframe column resolution (no leakage across dataframes)
            target_cols = [str(x) for x in (columns_idx if columns_idx else df.columns)]

            for col_idx in target_cols:
                dates = df[col_idx].str.count(pat=date_regex).sum()

                # To infer a date column, require date-like matches in at least
                # 75% of the rows
                if not dates >= len(df) * 0.75:
                    continue

                df[col_idx] = df[col_idx].str.strip()
                # BUGFIX: assign back instead of `inplace=True` on a column slice
                df[col_idx] = df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)})

            self.dataframes[df_idx] = df

        return self.dataframes
0 commit comments