gsheet.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. import datetime
  2. import io
  3. import os
  4. from enum import Enum
  5. from urllib.parse import urlparse
  6. from uuid import uuid4
  7. import pandas as pd
  8. import requests
  9. from pandera.typing import DataFrame
  10. from pandera.errors import SchemaError
  11. from .validation import ContactSchema, CreneauSchema, CreneauDataSchema
  12. planning_gid = "1001381542"
  13. creneau_gid = "1884137958"
  14. benevole_gid = "82247394"
  15. class ParserState(Enum):
  16. STARTING_VALUE = 0
  17. READING_VALUE = 1
  18. ESCAPING = 2
  19. CLOSING_ESCAPE = 3
  20. def split_csv_row(raw_data: str, separator: str = ",", escape: str = '"') -> list[str]:
  21. state: ParserState = ParserState.STARTING_VALUE
  22. arr = []
  23. current_item = ""
  24. for c in raw_data:
  25. if state == ParserState.STARTING_VALUE:
  26. if c == escape:
  27. state = ParserState.ESCAPING
  28. elif c == separator:
  29. arr.append("")
  30. else:
  31. state = ParserState.READING_VALUE
  32. current_item = c
  33. elif state == ParserState.READING_VALUE:
  34. if c == separator:
  35. state = ParserState.STARTING_VALUE
  36. arr.append(current_item)
  37. current_item = ""
  38. else:
  39. current_item += c
  40. elif state == ParserState.ESCAPING:
  41. if c == escape:
  42. state = ParserState.CLOSING_ESCAPE
  43. else:
  44. current_item += c
  45. elif state == ParserState.CLOSING_ESCAPE:
  46. if c == escape:
  47. state = ParserState.ESCAPING
  48. current_item += c
  49. else:
  50. state = ParserState.READING_VALUE
  51. arr.append(current_item)
  52. current_item = ""
  53. arr.append(current_item)
  54. return arr
  55. class InvalidUrlError(Exception):
  56. pass
  57. class ParsingError(Exception):
  58. pass
  59. def extract_doc_uid(url: str) -> str:
  60. res = urlparse(url)
  61. if res.netloc != "docs.google.com":
  62. raise InvalidUrlError("Invalid netloc")
  63. if not res.path.startswith("/spreadsheets/d/"):
  64. raise InvalidUrlError("Invalid path")
  65. doc_id = res.path.split("/")[3]
  66. l_doc_id = len(doc_id)
  67. if l_doc_id < 32 and 50 > l_doc_id:
  68. raise InvalidUrlError("Invalid path")
  69. return doc_id
  70. def build_sheet_url(doc_id, sheet_id):
  71. return f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&gid={sheet_id}"
  72. def downloadAndSave(doc_ui, sheet_gid, fname):
  73. url = build_sheet_url(doc_ui, sheet_gid)
  74. print("Downloading " + fname)
  75. rep = requests.get(url)
  76. with open(fname, "wb") as f:
  77. f.write(rep.content)
  78. def getContactDataFrame(
  79. csv_filename: str, skiprows: int = 2
  80. ) -> DataFrame[ContactSchema]:
  81. df_contact = pd.read_csv(csv_filename, skiprows=skiprows)
  82. column_to_drop = [name for name in df_contact.columns if "Unnamed" in name]
  83. df_contact.drop(column_to_drop, axis=1, inplace=True)
  84. # Filter out empty name
  85. df_contact = df_contact[~df_contact.Nom.isnull()]
  86. df_contact.reset_index()
  87. # coerce SMS to boolean
  88. df_contact["SMS"] = df_contact["SMS"] == "Oui"
  89. # create unique contact key
  90. df_contact["key"] = df_contact["Prénom"] + " " + df_contact.Nom.str.slice(0, 1)
  91. return ContactSchema.validate(df_contact)
  92. def getCreneauDataFrame(csv_filename: str) -> DataFrame[CreneauDataSchema]:
  93. df_creneau = pd.read_csv(csv_filename)
  94. df_creneau.columns = ["title", "lieu", "description", "responsable", "tags"]
  95. df_creneau.loc[df_creneau.tags.isnull(), "tags"] = ""
  96. df_creneau.loc[df_creneau.lieu.isnull(), "lieu"] = ""
  97. return CreneauDataSchema.validate(df_creneau)
  98. def getPlanningDataFrame(csv_filename, starting_date, skip_column=3):
  99. list_creneau = []
  100. with io.open(csv_filename, "r", encoding="utf-8") as f:
  101. datas = [split_csv_row(s.replace("\n", "")) for s in f.readlines()]
  102. def getDate(day: str) -> datetime.datetime:
  103. s = day.lower()
  104. if s.startswith("mercredi"):
  105. return starting_date + datetime.timedelta(days=-3)
  106. elif s.startswith("jeudi"):
  107. return starting_date + datetime.timedelta(days=-2)
  108. elif s.startswith("vendredi"):
  109. return starting_date + datetime.timedelta(days=-1)
  110. elif s.startswith("samedi"):
  111. return starting_date
  112. elif s.startswith("dimanche"):
  113. return starting_date + datetime.timedelta(days=1)
  114. elif s.startswith("lundi"):
  115. return starting_date + datetime.timedelta(days=2)
  116. raise KeyError("This day is not valid : " + s)
  117. def getTime(time_str: str) -> datetime.timedelta:
  118. splitted_time = time_str.split("h")
  119. hours = int(splitted_time[0])
  120. if hours < 5:
  121. hours += 24
  122. if len(splitted_time) > 1 and splitted_time[1] != "":
  123. return datetime.timedelta(hours=hours, minutes=int(splitted_time[1]))
  124. else:
  125. return datetime.timedelta(hours=hours)
  126. def getStartEnd(time_str: str) -> tuple:
  127. pair = time_str.split("-")
  128. if len(pair) == 2:
  129. return getTime(pair[0]), getTime(pair[1])
  130. else:
  131. start = getTime(time_str.split("+")[0])
  132. return start, start + datetime.timedelta(hours=1)
  133. # Parse headers
  134. headers = datas[skip_column : skip_column + 2]
  135. days_str = headers[0]
  136. hours = headers[1]
  137. column_to_dates: dict[int, dict[str, datetime.datetime]] = {}
  138. for i in range(skip_column, len(days_str)):
  139. day = getDate(days_str[i])
  140. start, end = getStartEnd(hours[i])
  141. column_to_dates[i] = {"start": day + start, "end": day + end}
  142. list_creneau: list[dict] = []
  143. for i in range(5, len(datas)):
  144. row = datas[i]
  145. current_benevole_name = ""
  146. current_time: dict[str, datetime.datetime] = {}
  147. for j in range(skip_column, len(row)):
  148. if (current_benevole_name != "") and (
  149. row[j] == "" or row[j] != current_benevole_name
  150. ):
  151. new_creneau = {
  152. "id": str(uuid4()),
  153. "template_id": row[0],
  154. "nom": row[1],
  155. "benevole_nom": current_benevole_name,
  156. "ligne": i + 1,
  157. **current_time,
  158. }
  159. list_creneau.append(new_creneau)
  160. current_benevole_name = ""
  161. current_time = {}
  162. if row[j] != "":
  163. current_benevole_name = row[j]
  164. if len(current_time.keys()) == 0:
  165. current_time = column_to_dates[j].copy()
  166. else:
  167. current_time["end"] = column_to_dates[j]["end"]
  168. if current_benevole_name != "":
  169. new_creneau = {
  170. "id": str(uuid4()),
  171. "template_id": row[0],
  172. "nom": row[1],
  173. "benevole_nom": current_benevole_name,
  174. "ligne": i + 1,
  175. **current_time,
  176. }
  177. list_creneau.append(new_creneau)
  178. print(f"{len(list_creneau)} créneaux trouvés")
  179. df = pd.DataFrame.from_dict(list_creneau)
  180. return CreneauSchema.validate(df)
  181. def parseGsheet(doc_uuid: str, saturday_date: datetime.datetime):
  182. suffix = "_2023"
  183. fname_planning = f"./planning{suffix}.csv"
  184. fname_creneau = f"./creneau{suffix}.csv"
  185. fname_contact = f"./benevole{suffix}.csv"
  186. downloadAndSave(doc_uuid, planning_gid, fname_planning)
  187. downloadAndSave(doc_uuid, creneau_gid, fname_creneau)
  188. downloadAndSave(doc_uuid, benevole_gid, fname_contact)
  189. try:
  190. df_contact = getContactDataFrame(fname_contact)
  191. except SchemaError as exc:
  192. msg = "Donnée erronée sur les bénévoles\n" + str(exc)
  193. raise ParsingError(msg)
  194. try:
  195. df_creneau = getCreneauDataFrame(fname_creneau)
  196. except SchemaError as exc:
  197. raise ParsingError("Donnée erronée des description des créneaux\n" + str(exc))
  198. try:
  199. df_planning = getPlanningDataFrame(fname_planning, saturday_date)
  200. except SchemaError as exc:
  201. raise ParsingError("Donnée erronée des créneaux\n" + str(exc))
  202. os.remove(fname_planning)
  203. os.remove(fname_creneau)
  204. os.remove(fname_contact)
  205. return df_contact, df_creneau, df_planning