# Reconstructed from a three-commit patch series against
# pynsee/sirene/search_sirene.py ("safe encoding", "Fix ref in docstring",
# "Fix punctuation pattern").  The code below is the state of the added
# definitions once all three commits are applied.  The series also moves
# ``import logging`` into the top-of-file import group.
import re
import string
import urllib.parse

SEPARATORS = {
    # Characters that must be left unencoded, per target field.
    # See doc : https://portail-api.insee.fr/catalog/api/2ba0e549-5587-3ef1-9082-99cd865de66f/doc?page=66396595-39c0-44b2-b965-9539c004b244#variables-%C3%A9tablissement
    "enseigne1Etablissement": " -'*/()",
    "enseigne2Etablissement": " -'*/()",
    "enseigne3Etablissement": " -'*/()",
    "denominationUsuelleEtablissement": " -'*/()",
    "libelleVoieEtablissement": " -?'*/:!()[]",
    "libelleCommuneEtablissement": " -?'*/:!()[]",
    "libelleCommuneEtrangerEtablissement": " -?'*/:!()[]",
    "distributionSpecialeEtablissement": " -?'*/:!()[]",
    "libellePaysEtrangerEtablissement": " -?'*/:!()[]",
    "libelleCedexEtablissement": " -?'*/:!()[]",
    "nomUsageUniteLegale": " -?'/",
    "prenom1UniteLegale": " -?'/",
    "prenom2UniteLegale": " -?'/",
    "prenom3UniteLegale": " -?'/",
    "prenom4UniteLegale": " -?'/",
    "prenomUsuelUniteLegale": " -?'/",
    "nomUniteLegale": " -?'/",
    "denominationUniteLegale": " -?'/",
    "pseudonymeUniteLegale": " -?'/",
    "libelleNationaliteUniteLegale": " -?'/",
    "denominationUsuelle1UniteLegale": " -?'/",
    "denominationUsuelle2UniteLegale": " -?'/",
    "denominationUsuelle3UniteLegale": " -?'/",
    "sigleUniteLegale": " .-?'/",
    "complementAdresseEtablissement": " ?'*/:!()[]",
    "typeVoieEtablissement": " ?'*/:!()[]",
    # Note : for the following fields, the "punctuation+blanc*" rule is
    # managed directly in the code, see "safe_encode" func
    "numeroVoieEtablissement": " ",
    "dernierNumeroVoieEtablissement": " ",
}

# Fields whose values get the "punctuation + blank" normalisation before
# being percent-encoded.
_PUNCTUATION_BLANK_FIELDS = frozenset(
    {
        "numeroVoieEtablissement",
        "dernierNumeroVoieEtablissement",
    }
)

# One punctuation character followed by at least one space.
# ``re.escape`` is required: spliced in raw, the ``\]`` pair found in
# ``string.punctuation`` is read as an escaped ``]``, which silently drops
# the backslash from the character class.
# NOTE(review): the API doc names the rule "punctuation+blanc*" while the
# series ends on " +" (at least one blank required) - confirm this is the
# intended reading.
_PUNCTUATION_BLANK_RE = re.compile(
    "[" + re.escape(string.punctuation) + "] +"
)


def safe_encode(field: str, val: str) -> str:
    """
    Percent-encode the special characters of a search pattern.

    Characters listed as separators for ``field`` in ``SEPARATORS`` are
    left untouched; every other character is handled by
    ``urllib.parse.quote``. A ``field`` that has no entry in ``SEPARATORS``
    is not encoded at all and ``val`` is returned unchanged.

    See doc : https://portail-api.insee.fr/catalog/api/2ba0e549-5587-3ef1-9082-99cd865de66f/doc?page=66396595-39c0-44b2-b965-9539c004b244#variables-%C3%A9tablissement

    Parameters
    ----------
    field : str
        Target field of the query (ex. 'denominationUniteLegale').
    val : str
        Subquery targeting this field (ex. "ART & CLIM SARL").

    Returns
    -------
    str
        Encoded ``val``.

    Examples
    --------
    >>> safe_encode("denominationUniteLegale", "ART & CLIM SARL")
    'ART %26 CLIM SARL'
    >>> safe_encode("numeroVoieEtablissement", "12, bis")
    '12 bis'

    """
    if field in _PUNCTUATION_BLANK_FIELDS:
        # "punctuation + blank" rule: collapse each such run into a single
        # space, which is kept as-is by the quoting step below because " "
        # is the separator declared for these fields.
        val = _PUNCTUATION_BLANK_RE.sub(" ", val)

    separators = SEPARATORS.get(field)
    if separators is None:
        # Unknown field: no separator rule available, leave untouched.
        return val
    return urllib.parse.quote(val, safe=separators)


# Call site added by the series (inside ``search_sirene``, right after
# ``pattern`` has been normalised to a list):
#
#     pattern = [
#         safe_encode(field, val) for field, val in zip(variable, pattern)
#     ]