"""sheetcloud.sheets — read, write, format, and share spreadsheets via the sheetcloud service."""
import logging
import io
import pandas as pd

from datetime import datetime, timezone
from typing import Tuple, Dict, List, Optional

from sheetcloud.conn import service, ENV_SHEETCLOUD_DEV
from sheetcloud.utils import get_modification_datetime_from_file, create_dir
from sheetcloud.templates import load_template

logger = logging.getLogger("SHEETCLOUD SHEETS")
# NOTE(review): basicConfig at import time configures the root logger, a side
# effect libraries normally avoid; kept for backward compatibility.
logging.basicConfig(
    format="\x1b[38;5;224m %(levelname)8s \x1b[0m | \x1b[38;5;39m %(name)s \x1b[0m | %(message)s",
    level=logging.DEBUG,
)

if not ENV_SHEETCLOUD_DEV:
    import warnings

    warnings.filterwarnings("ignore")

# Directory holding the local parquet cache of downloaded worksheets.
SHEETCLOUD_CACHE_PATH = ".tmp"


def _cache_read(id: str, worksheet: str, ts: datetime) -> Optional[pd.DataFrame]:
    """Return a locally cached copy of *worksheet* or None when unavailable.

    Args:
        id (str): Spreadsheet id, used to build the cache file name.
        worksheet (str): Worksheet name.
        ts (datetime): Modification timestamp of the remote spreadsheet (UTC).

    Returns:
        Optional[pd.DataFrame]: The cached data, or None when no usable cache
        file exists.
    """
    create_dir(SHEETCLOUD_CACHE_PATH)
    fname = f"{SHEETCLOUD_CACHE_PATH}/{id}_{worksheet}.parquet"
    ts_local = get_modification_datetime_from_file(fname)
    # NOTE(review): the cache is served when the local file timestamp is OLDER
    # than the remote one (ts_local < ts) — confirm this freshness rule is
    # intended; it mirrors the append check in _cache_write.
    if ts_local is not None and ts_local < ts:
        logger.debug(
            f"Restore {worksheet} from local cache. Local ts={ts_local}, remote ts={ts}."
        )
        return pd.read_parquet(fname)
    return None


def _cache_write(
    id: str, worksheet: str, df: pd.DataFrame, append: bool, ts: datetime
) -> None:
    """Store *df* in the local parquet cache, overwriting or appending.

    Args:
        id (str): Spreadsheet id, used to build the cache file name.
        worksheet (str): Worksheet name.
        df (pd.DataFrame): Data to cache.
        append (bool): Append to an existing cache entry instead of overwriting.
        ts (datetime): Modification timestamp of the remote spreadsheet (UTC).
    """
    create_dir(SHEETCLOUD_CACHE_PATH)
    fname = f"{SHEETCLOUD_CACHE_PATH}/{id}_{worksheet}.parquet"
    if not append:
        df.to_parquet(fname)
        ts_local = get_modification_datetime_from_file(fname)
        logger.debug(
            f"Store worksheet {worksheet} in local cache. Local ts={ts_local}, remote ts={ts}."
        )
    else:
        ts_local = get_modification_datetime_from_file(fname)
        logger.debug(
            f"Checking if data can be appended to worksheet {worksheet} in local cache. Local ts={ts_local}, remote ts={ts}."
        )
        # Append only when a cache file exists and passes the timestamp check;
        # otherwise the cache entry is left untouched.
        if ts_local is not None and ts_local < ts:
            df_org = pd.read_parquet(fname)
            df = pd.concat([df_org, df], ignore_index=True)
            df.to_parquet(fname)
            ts_local = get_modification_datetime_from_file(fname)
            logger.debug(
                f"Appending data to worksheet {worksheet} in local cache. Local ts={ts_local}, remote ts={ts}."
            )


def list_spreadsheets() -> List[Dict]:
    """Return the list of spreadsheets accessible through the service."""
    res = service("/sheets/list", method="post")
    if "sheets" in res:
        logger.info(f'List contains a total of {len(res["sheets"])} spreadsheets.')
        return res["sheets"]
    return list()


def list_worksheets_in_spreadsheet(sheet_url_or_name: str) -> List[str]:
    """Return the worksheet names contained in the given spreadsheet."""
    res = service(
        "/sheets/list",
        method="post",
        params={"spreadsheet_url_or_name": sheet_url_or_name},
    )
    if "sheets" in res:
        logger.info(f'List contains a total of {len(res["sheets"])} worksheets.')
        return res["sheets"]
    return list()


def get_modified_datetime(
    sheet_url_or_name: str,
) -> Tuple[datetime, Optional[str], Optional[str]]:
    """Return (modification time in UTC, spreadsheet id, title).

    Falls back to (current UTC time, None, None) when the service response
    carries no timestamp.
    """
    res = service(
        "/sheets/modified-time",
        params={"spreadsheet_url_or_name": sheet_url_or_name},
        method="post",
    )
    if "timestamp" in res:
        ts = datetime.fromisoformat(res["timestamp"])
        ts = ts.astimezone(timezone.utc)
        return ts, res["id"], res["title"]
    logger.warning(
        f"Could not find timestamp of spreadsheet {sheet_url_or_name}. Returning current time."
    )
    # Fix: return an aware UTC timestamp; the success path above is aware and
    # naive/aware datetimes cannot be compared by the cache helpers.
    return datetime.now(timezone.utc), None, None


def format_spreadsheet(
    sheet_url_or_name: str,
    worksheet_name: str,
    a1range_format_list: List[Tuple[str, Dict]],
    auto_resize: bool = True,
) -> None:
    """Apply formats to A1 ranges of a worksheet.

    Args:
        sheet_url_or_name (str): The name or URL of a spreadsheet.
        worksheet_name (str): The worksheet name.
        a1range_format_list (List[Tuple[str, Dict]]): Pairs of (A1 range, format dict).
        auto_resize (bool, optional): Ask the service to auto-resize. Defaults to True.
    """
    # Fix: use the module logger instead of stray debug print() calls.
    fmts = [{"a1range": a1range, "format": fmt} for a1range, fmt in a1range_format_list]
    logger.debug("Applying formats: %s", fmts)
    params = {
        "spreadsheet_url_or_name": sheet_url_or_name,
        "worksheet_name": worksheet_name,
        "auto_resize": auto_resize,
    }
    _ = service("/sheets/format", data={"formats": fmts}, params=params, method="post")


def share(
    sheet_url_or_name: str,
    share_emails_write_access: Optional[List[str]] = None,
    share_emails_read_only_access: Optional[List[str]] = None,
    notification_msg: Optional[str] = None,
) -> Dict:
    """Share a spreadsheet with the given e-mail addresses.

    Args:
        sheet_url_or_name (str): The name or URL of a spreadsheet.
        share_emails_write_access (Optional[List[str]]): E-mails granted write access.
        share_emails_read_only_access (Optional[List[str]]): E-mails granted read-only access.
        notification_msg (Optional[str]): Message included in the notification.

    Returns:
        Dict: The raw service response.
    """
    params = {"spreadsheet_url_or_name": sheet_url_or_name}
    data = {
        "emails_write_access": share_emails_write_access,
        "emails_read_access": share_emails_read_only_access,
        "notification_msg": notification_msg,
    }
    # NOTE(review): unlike the other endpoints, no method="post" is passed
    # here — confirm the service default is correct for /sheets/share.
    resp = service("/sheets/share", params=params, data=data)
    return resp


def infer_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Convert object columns to numeric or datetime dtypes where possible.

    Mutates and returns *df*. A column is converted only when exactly one of
    the two conversions (numeric, datetime) succeeds.
    """
    for c in df.columns:
        # Fix: errors="ignore" and infer_datetime_format are deprecated in
        # modern pandas; emulate "ignore" by keeping the original column on
        # conversion failure.
        try:
            c_num = pd.to_numeric(df[c])
        except (ValueError, TypeError):
            c_num = df[c]
        try:
            c_dt = pd.to_datetime(df[c])
        except (ValueError, TypeError, OverflowError):
            c_dt = df[c]

        if c_dt.dtype == "object" and not c_num.dtype == "object":
            df[c] = c_num
        if not c_dt.dtype == "object" and c_num.dtype == "object":
            df[c] = c_dt
    return df


def read(
    sheet_url_or_name: str,
    worksheet_name: str,
    try_infer_dtypes: bool = True,
    cache: bool = True,
) -> pd.DataFrame:
    """Read worksheet from spreadsheet into a DataFrame.

    Args:
        sheet_url_or_name (str): The name or URL of a spreadsheet
        worksheet_name (str): The worksheet name
        try_infer_dtypes (bool, optional): Dtypes such as float/int need to be inferred otherwise dtype will be object. Defaults to True.
        cache (bool, optional): Caching large amounts of read-only data locally is much faster when accessing it multiple times. Defaults to True.

    Returns:
        pd.DataFrame: DataFrame containing the data of `worksheet_name` of
        spreadsheet `sheet_url_or_name`, or None when the service reports an
        error.
    """
    ts = None
    id = None
    if cache:
        ts, id, title = get_modified_datetime(sheet_url_or_name)
        if id is not None:
            df = _cache_read(id=id, worksheet=worksheet_name, ts=ts)
            if df is not None:
                return df

    # No cached version available: download the worksheet as parquet bytes.
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/octet-stream",
    }
    params = {
        "spreadsheet_url_or_name": sheet_url_or_name,
        "worksheet_name": worksheet_name,
    }
    content = service(
        "/sheets/read", params=params, headers=headers, method="post", return_dict=False
    )
    # A dict/str response signals a service-side error instead of parquet data.
    if isinstance(content, dict) or isinstance(content, str):
        logger.error(content)
        return None
    df = pd.read_parquet(
        io.BytesIO(content), engine="pyarrow", use_nullable_dtypes=True
    )
    if cache and id is not None:
        _cache_write(id, worksheet_name, df, False, ts)

    if try_infer_dtypes:
        df = infer_dtypes(df)
    return df


def write(
    sheet_url_or_name: str,
    worksheet_name: str,
    df: pd.DataFrame,
    append: bool = False,
    cache: bool = True,
) -> None:
    """Write a DataFrame to a specific worksheet within a given spreadsheet.

    If the spreadsheet `sheet_url_or_name` does not exist, a new one is
    created. The same holds true for worksheet `worksheet_name`.

    Args:
        sheet_url_or_name (str): The name or URL of a spreadsheet
        worksheet_name (str): The worksheet name
        df (pd.DataFrame): Data to store.
        append (bool, optional): Append the data instead of overwrite. Assumes the worksheet exists. Defaults to False.
        cache (bool, optional): Caching large amounts of read-only data locally is much faster when accessing it multiple times. Defaults to True.
    """
    # Fix: stringify datetime columns (for JSON serializability) on a copy so
    # the caller's DataFrame is not mutated.
    dfc = df.select_dtypes(include=["datetime", "datetimetz"])
    if len(dfc.columns) > 0:
        df = df.copy()
        df[dfc.columns] = dfc.astype("str")

    if cache:
        ts, id, title = get_modified_datetime(sheet_url_or_name)
        if id is not None:
            _cache_write(id=id, worksheet=worksheet_name, df=df, append=append, ts=ts)

    with io.BytesIO() as memory_buffer:
        df.to_parquet(memory_buffer, compression="gzip", engine="pyarrow")
        memory_buffer.seek(0)
        # The payload must be sent separately as a multipart file.
        files = {"file": ("Test", memory_buffer, "application/octet-stream")}
        params = {
            "spreadsheet_url_or_name": sheet_url_or_name,
            "worksheet_name": worksheet_name,
        }
        endpoint = "/sheets/append" if append else "/sheets/write"
        resp = service(endpoint, params=params, files=files, method="post")
        # Fix: log the response instead of printing it.
        logger.debug("Service response: %s", resp)


def append(
    sheet_url_or_name: str, worksheet_name: str, df: pd.DataFrame, cache: bool = True
) -> None:
    """Append *df* to an existing worksheet (thin wrapper around write)."""
    write(sheet_url_or_name, worksheet_name, df, append=True, cache=cache)


if __name__ == "__main__":
    # Simple manual smoke test against a demo spreadsheet.
    print("Start connecting...")
    print(list_spreadsheets())

    df = pd.read_csv(
        "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
    )
    df["a_date_time"] = datetime.now()
    df["an_int_column"] = 10
    write("sheetcloud-test", "write-test", df)

    df = read("sheetcloud-test", "write-test")
    print(df.info())

    df = pd.DataFrame(
        [[1, 2, 3], [4, pd.NA, 6], [7, 7, pd.NA]], columns=["col1", "col2", "col3"]
    )
    # append('sheetcloud-test', 'write-test', df)
    # share('sheetcloud-test-1', share_emails_read_only_access=['nico.goernitz@gmail.com'])
    # format_spreadsheet('sheetcloud-test', 'write-test', [('A1:F1', header_red)], auto_resize=False)
    print("Done")
def list_worksheets_in_spreadsheet(sheet_url_or_name: str) -> List[str]:
    """Return the names of all worksheets in the given spreadsheet.

    Args:
        sheet_url_or_name (str): The name or URL of a spreadsheet.

    Returns:
        List[str]: Worksheet names; empty when the service reports none.
    """
    response = service(
        "/sheets/list",
        method="post",
        params={"spreadsheet_url_or_name": sheet_url_or_name},
    )
    if "sheets" not in response:
        return []
    worksheets = response["sheets"]
    logger.info(f"List contains a total of {len(worksheets)} worksheets.")
    return worksheets
def get_modified_datetime(
    sheet_url_or_name: str,
) -> Tuple[datetime, Optional[str], Optional[str]]:
    """Return (modification time in UTC, spreadsheet id, title).

    Falls back to (current UTC time, None, None) when the service response
    contains no timestamp.
    """
    res = service(
        "/sheets/modified-time",
        params={"spreadsheet_url_or_name": sheet_url_or_name},
        method="post",
    )
    if "timestamp" in res:
        ts = datetime.fromisoformat(res["timestamp"])
        ts = ts.astimezone(timezone.utc)
        return ts, res["id"], res["title"]
    logger.warning(
        f"Could not find timestamp of spreadsheet {sheet_url_or_name}. Returning current time."
    )
    # Fix: return an aware UTC timestamp; the success path above is aware and
    # comparing naive with aware datetimes raises TypeError downstream.
    return datetime.now(timezone.utc), None, None
def format_spreadsheet(
    sheet_url_or_name: str,
    worksheet_name: str,
    a1range_format_list: List[Tuple[str, Dict]],
    auto_resize: bool = True,
) -> None:
    """Apply formats to A1 ranges of a worksheet.

    Args:
        sheet_url_or_name (str): The name or URL of a spreadsheet.
        worksheet_name (str): The worksheet name.
        a1range_format_list (List[Tuple[str, Dict]]): Pairs of (A1 range, format dict).
        auto_resize (bool, optional): Ask the service to auto-resize. Defaults to True.
    """
    # Fix: use the module logger instead of stray debug print() calls.
    fmts = [{"a1range": a1range, "format": fmt} for a1range, fmt in a1range_format_list]
    logger.debug("Applying formats: %s", fmts)
    params = {
        "spreadsheet_url_or_name": sheet_url_or_name,
        "worksheet_name": worksheet_name,
        "auto_resize": auto_resize,
    }
    _ = service("/sheets/format", data={"formats": fmts}, params=params, method="post")
def infer_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Convert object columns of *df* to numeric or datetime dtypes.

    Mutates and returns *df*. A column is converted only when exactly one of
    the two conversions (numeric, datetime) succeeds; ambiguous or
    unconvertible columns are left untouched.
    """
    for c in df.columns:
        # Fix: errors="ignore" and infer_datetime_format are deprecated in
        # modern pandas; emulate the "ignore" semantics by keeping the
        # original column when a conversion fails.
        try:
            c_num = pd.to_numeric(df[c])
        except (ValueError, TypeError):
            c_num = df[c]
        try:
            c_dt = pd.to_datetime(df[c])
        except (ValueError, TypeError, OverflowError):
            c_dt = df[c]

        if c_dt.dtype == "object" and not c_num.dtype == "object":
            df[c] = c_num
        if not c_dt.dtype == "object" and c_num.dtype == "object":
            df[c] = c_dt
    return df
def read(
    sheet_url_or_name: str,
    worksheet_name: str,
    try_infer_dtypes: bool = True,
    cache: bool = True,
) -> pd.DataFrame:
    """Load a worksheet of a spreadsheet into a DataFrame.

    Args:
        sheet_url_or_name (str): The name or URL of a spreadsheet.
        worksheet_name (str): The worksheet to read.
        try_infer_dtypes (bool, optional): Attempt to convert object columns
            to numeric/datetime dtypes. Defaults to True.
        cache (bool, optional): Serve from the local cache when possible and
            refresh it after a download. Defaults to True.

    Returns:
        pd.DataFrame: Contents of `worksheet_name`, or None when the service
        reports an error.
    """
    remote_ts = None
    sheet_id = None
    if cache:
        remote_ts, sheet_id, _ = get_modified_datetime(sheet_url_or_name)
        if sheet_id is not None:
            cached = _cache_read(id=sheet_id, worksheet=worksheet_name, ts=remote_ts)
            if cached is not None:
                return cached

    # Cache miss: download the worksheet as a parquet byte stream.
    content = service(
        "/sheets/read",
        params={
            "spreadsheet_url_or_name": sheet_url_or_name,
            "worksheet_name": worksheet_name,
        },
        headers={
            "Content-Type": "application/json",
            "Accept": "application/octet-stream",
        },
        method="post",
        return_dict=False,
    )
    # A dict/str response signals a service-side error rather than data.
    if isinstance(content, (dict, str)):
        logger.error(content)
        return None

    df = pd.read_parquet(
        io.BytesIO(content), engine="pyarrow", use_nullable_dtypes=True
    )
    if cache and sheet_id is not None:
        _cache_write(sheet_id, worksheet_name, df, False, remote_ts)

    if try_infer_dtypes:
        df = infer_dtypes(df)
    return df
Read a worksheet from a spreadsheet into a DataFrame.
Args: sheet_url_or_name (str): The name or URL of a spreadsheet. worksheet_name (str): The worksheet name. try_infer_dtypes (bool, optional): Dtypes such as float/int need to be inferred, otherwise the dtype will be object. Defaults to True. cache (bool, optional): Caching large amounts of read-only data locally is much faster when accessing it multiple times. Defaults to True.
Returns: pd.DataFrame: DataFrame containing the data of `worksheet_name` of spreadsheet `sheet_url_or_name`.
def write(
    sheet_url_or_name: str,
    worksheet_name: str,
    df: pd.DataFrame,
    append: bool = False,
    cache: bool = True,
) -> None:
    """Write a DataFrame to a specific worksheet within a given spreadsheet.

    If the spreadsheet `sheet_url_or_name` does not exist, a new one is
    created. The same holds true for worksheet `worksheet_name`.

    Args:
        sheet_url_or_name (str): The name or URL of a spreadsheet
        worksheet_name (str): The worksheet name
        df (pd.DataFrame): Data to store.
        append (bool, optional): Append the data instead of overwrite. Assumes the worksheet exists. Defaults to False.
        cache (bool, optional): Caching large amounts of read-only data locally is much faster when accessing it multiple times. Defaults to True.
    """
    # Fix: stringify datetime columns (for JSON serializability) on a copy so
    # the caller's DataFrame is not mutated.
    dfc = df.select_dtypes(include=["datetime", "datetimetz"])
    if len(dfc.columns) > 0:
        df = df.copy()
        df[dfc.columns] = dfc.astype("str")

    if cache:
        ts, id, title = get_modified_datetime(sheet_url_or_name)
        if id is not None:
            _cache_write(id=id, worksheet=worksheet_name, df=df, append=append, ts=ts)

    with io.BytesIO() as memory_buffer:
        df.to_parquet(memory_buffer, compression="gzip", engine="pyarrow")
        memory_buffer.seek(0)
        # The payload must be sent separately as a multipart file.
        files = {"file": ("Test", memory_buffer, "application/octet-stream")}
        params = {
            "spreadsheet_url_or_name": sheet_url_or_name,
            "worksheet_name": worksheet_name,
        }
        endpoint = "/sheets/append" if append else "/sheets/write"
        resp = service(endpoint, params=params, files=files, method="post")
        # Fix: log the response instead of printing it.
        logger.debug("Service response: %s", resp)
Write a DataFrame to a specific worksheet within a given spreadsheet. If the spreadsheet `sheet_url_or_name` does not exist, a new one is created. The same holds true for worksheet `worksheet_name`.
Args: sheet_url_or_name (str): The name or URL of a spreadsheet. worksheet_name (str): The worksheet name. df (pd.DataFrame): Data to store. append (bool, optional): Append the data instead of overwriting. Assumes the worksheet exists. Defaults to False. cache (bool, optional): Caching large amounts of read-only data locally is much faster when accessing it multiple times. Defaults to True.