Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
.gradle/
gradle/
gradlew*
node_modules/
# generated
build/
out/
Expand Down
201 changes: 201 additions & 0 deletions ScriptFloraBrasil/fetch_taxa.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
// fetch_taxa.ts
import fs from "fs";
import path from "path";
import axios from "axios";
import { parse } from "csv-parse";
import pLimit from "p-limit";

// A raw CSV record when parsed without headers (positional columns).
type Row = string[];

// Path to the input CSV (the query-result file provided by the user).
const CSV_PATH = "querySELECT.csv";

// Output directory; created eagerly at module load so later writes succeed.
const OUT_DIR = path.resolve(process.cwd(), "output");
if (!fs.existsSync(OUT_DIR)) fs.mkdirSync(OUT_DIR, { recursive: true });

// Tuning knobs for the fetch loop.
const CONCURRENCY = 5; // number of simultaneous requests
const RETRIES = 3; // retry attempts on failure
const RETRY_DELAY_MS = 1000; // delay between retries
const REQUEST_TIMEOUT = 15000; // axios timeout in ms
const BASE_URL = "https://servicos.jbrj.gov.br/v2/flora/taxon/";

/** Pause helper: resolves after the given number of milliseconds. */
function sleep(ms: number) {
  return new Promise((resolve) => {
    setTimeout(() => {
      resolve(undefined);
    }, ms);
  });
}

/**
 * Reads the input CSV and returns one { genero, especie } pair per row.
 *
 * Strategy:
 * - Parse with `columns: true` so the header row becomes object keys.
 * - If a header looks like "genero"/"especie" (case-insensitive, accented
 *   variants included), use those columns.
 * - Otherwise fall back to the first two columns of each record.
 * - If the header-based parse errors out, re-parse the whole file without
 *   headers and take the first two fields of every row.
 */
async function readCsv(
filePath: string
): Promise<{ genero: string; especie: string }[]> {
return new Promise((resolve, reject) => {
const input = fs.createReadStream(filePath);
const rows: { genero: string; especie: string }[] = [];
const parser = parse({
bom: true,
columns: true, // try the header row as keys; error handler below is the fallback
skip_empty_lines: true,
trim: true,
});

// Tracks whether header-based parsing is still in effect; cleared once a
// record's keys fail to match, which disables the error-path re-parse.
let usedColumns = true;

parser.on("readable", () => {
let record;
while ((record = parser.read())) {
// With columns=true each record is an object keyed by header name.
const keys = Object.keys(record);
// Look for header names that resemble genus/species.
const lowerKeys = keys.map((k) => k.toLowerCase());
const gi = lowerKeys.findIndex(
(k) =>
k.includes("gener") || k.includes("gênero") || k.includes("genero")
);
const ei = lowerKeys.findIndex(
(k) =>
k.includes("espec") ||
k.includes("espécie") ||
k.includes("especie") ||
k.includes("species")
);
if (gi !== -1 && ei !== -1) {
// NOTE(review): throws if a matched cell is undefined on some row —
// confirm the input never has ragged rows.
rows.push({
genero: record[keys[gi]].trim(),
especie: record[keys[ei]].trim(),
});
} else {
// Headers exist but none matched: fall back to the first two columns.
usedColumns = false;
// Convert the record object to an array in key order.
const vals = keys.map((k) => (record[k] ?? "").toString().trim());
rows.push({ genero: vals[0] ?? "", especie: vals[1] ?? "" });
}
}
});

parser.on("error", (err) => {
// Fallback: re-parse the file without the columns option.
if (usedColumns) {
// NOTE(review): this synchronous re-parse keeps any rows already
// collected, and a header row (if present) would be pushed as data
// here — confirm against the actual input format.
parse(fs.readFileSync(filePath), { bom: true }, (err2, data: Row[]) => {
if (err2) return reject(err2);
for (const r of data) {
if (r.length < 2) continue;
rows.push({ genero: r[0].trim(), especie: r[1].trim() });
}
resolve(rows);
});
} else {
reject(err);
}
});

parser.on("end", () => {
resolve(rows);
});

input.pipe(parser);
});
}

/**
 * GETs `url`, retrying on network failures and server-side (5xx) errors.
 *
 * Client errors (4xx — e.g. a 404 for an unknown taxon) are returned
 * immediately: the original code retried them RETRIES times, wasting
 * roughly RETRIES * RETRY_DELAY_MS per miss even though the server's
 * answer cannot change.
 *
 * @param url       Fully-encoded request URL.
 * @param triesLeft Remaining retry attempts (defaults to RETRIES).
 * @returns HTTP status and response body, even for error statuses.
 * @throws  The last axios error when no response was ever received.
 */
async function fetchWithRetries(
  url: string,
  triesLeft = RETRIES
): Promise<{ status: number; data: any }> {
  try {
    const resp = await axios.get(url, { timeout: REQUEST_TIMEOUT });
    return { status: resp.status, data: resp.data };
  } catch (err: any) {
    const status: number | undefined = err?.response?.status;
    // A 4xx means the server answered definitively; retrying is pointless.
    if (status !== undefined && status >= 400 && status < 500) {
      return { status, data: err.response.data };
    }
    if (triesLeft > 0) {
      await sleep(RETRY_DELAY_MS);
      return fetchWithRetries(url, triesLeft - 1);
    }
    // Out of retries: surface whatever response we last saw, if any.
    if (err.response) {
      return { status: err.response.status, data: err.response.data };
    }
    throw err;
  }
}

/**
 * Replaces every character that is not alphanumeric, underscore, hyphen or
 * dot with an underscore, yielding a filesystem-safe file name.
 */
function safeFileName(text: string) {
  const forbidden = /[^a-z0-9_\-\.]/gi;
  return text.replace(forbidden, "_");
}

/**
 * Entry point: loads the CSV, fetches every taxon from the Flora do Brasil
 * service with bounded concurrency, writes one JSON file per taxon, and
 * finishes with a summary.csv describing the outcome of each request.
 */
async function main() {
  console.log("Lendo CSV...", CSV_PATH);
  const items = await readCsv(CSV_PATH);
  console.log(`Linhas carregadas: ${items.length}`);

  const limit = pLimit(CONCURRENCY);
  const summaryRows: string[] = [];
  const header = [
    "genero",
    "especie",
    "taxon_name",
    "url",
    "http_status",
    "output_file",
  ];
  summaryRows.push(header.join(","));

  const tasks: Promise<void>[] = [];
  items.forEach((it, idx) => {
    const task = limit(async () => {
      const genero = (it.genero || "").trim();
      const especie = (it.especie || "").trim();
      // Rows with a missing genus or epithet are recorded and skipped.
      if (!genero || !especie) {
        console.warn(`Linha ${idx + 1}: genero/especie vazio, pulando`);
        summaryRows.push([genero, especie, "", "", "SKIPPED", ""].join(","));
        return;
      }

      const fullName = `${genero} ${especie}`;
      const requestUrl = BASE_URL + encodeURIComponent(fullName);

      try {
        const result = await fetchWithRetries(requestUrl);
        // One pretty-printed JSON file per taxon, suffixed with the row index.
        const fileName = safeFileName(`${genero}_${especie}_${idx + 1}.json`);
        fs.writeFileSync(
          path.join(OUT_DIR, fileName),
          JSON.stringify(result.data, null, 2),
          "utf-8"
        );
        console.log(`[OK] ${fullName} -> ${result.status} -> ${fileName}`);
        summaryRows.push(
          [genero, especie, fullName, requestUrl, String(result.status), fileName].join(",")
        );
      } catch (err: any) {
        console.error(`[ERR] ${fullName}:`, err.message ?? err);
        summaryRows.push(
          [genero, especie, fullName, requestUrl, "ERROR", ""].join(",")
        );
      }
    });
    tasks.push(task);
  });

  await Promise.all(tasks);

  const summaryPath = path.join(OUT_DIR, "summary.csv");
  fs.writeFileSync(summaryPath, summaryRows.join("\n"), "utf-8");
  console.log("Concluído. Saída em:", OUT_DIR);
  console.log("Arquivo resumo:", summaryPath);
}

main().catch((e) => {
  console.error("Erro fatal:", e);
  process.exit(1);
});
162 changes: 162 additions & 0 deletions ScriptFloraBrasil/jsons_to_xlsx.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// jsons_to_xlsx.js
import fs from "fs";
import path from "path";
import { globSync } from "glob";
import XLSX from "xlsx";

// Directory where fetch_taxa.ts wrote its per-taxon JSON files.
const OUT_DIR = path.resolve(process.cwd(), "output");
// Every *.json in the output directory is treated as a taxon record.
const GLOB_PATTERN = path.join(OUT_DIR, "*.json");
// Destination workbook: one "Taxa" sheet plus an "Authors" sheet.
const REPORT_PATH = path.join(OUT_DIR, "report.xlsx");

/**
 * Coerces any value to a display string: null/undefined become "",
 * strings are trimmed, everything else goes through String().
 */
function safeString(v) {
  if (v == null) {
    return "";
  }
  return typeof v === "string" ? v.trim() : String(v);
}

/**
 * Recovers genus and species from a file name such as
 * "Genus_species_12.json": the extension and the trailing "_<index>"
 * added by the fetch script are stripped, then the remaining words are
 * split on underscores/hyphens. Returns empty strings when fewer than
 * two words remain.
 */
function parseNameFromFilename(filename) {
  const ext = path.extname(filename);
  const withoutIndex = path.basename(filename, ext).replace(/_\d+$/, "");
  const words = withoutIndex
    .replace(/[_\-]+/g, " ")
    .split(/\s+/)
    .filter(Boolean);
  if (words.length < 2) {
    return { genero: "", especie: "" };
  }
  return { genero: words[0], especie: words[1] };
}

/**
 * Reads and parses one JSON file. Returns the parsed value, or null
 * (after logging the problem) when the file cannot be read or is not
 * valid JSON.
 */
function readJsonFile(filePath) {
  try {
    return JSON.parse(fs.readFileSync(filePath, "utf-8"));
  } catch (err) {
    console.error("Erro lendo/parsing JSON:", filePath, err.message);
    return null;
  }
}

/**
 * Splits an authorship string into individual author names.
 * HTML tags and parentheses are stripped first, then the text is split on
 * the usual separators (comma, semicolon, "&", "and", Portuguese "e", "/")
 * and whitespace inside each name is collapsed to single spaces.
 */
function extractAuthorsFromString(authStr) {
  if (!authStr) {
    return [];
  }
  const plain = authStr
    .replace(/<\/?[^>]+(>|$)/g, " ")
    .replace(/[()]/g, " ");
  return plain
    .split(/,|;|&|\band\b|\be\b|\//i)
    .map(p => p.trim())
    .filter(Boolean)
    .map(p => p.replace(/\s+/g, " ").trim());
}

/**
 * Flattens one parsed JSON payload into a spreadsheet row.
 *
 * The service may return either an array or a single object; the taxon
 * record is looked for at data[0].taxon, then data.taxon, then data[0]
 * itself. A genus/epithet missing from the payload is recovered from the
 * "Genus_species_N.json" file name written by the fetch script.
 */
function normalizeRecord(filePath, data) {
  let taxon = null;
  if (Array.isArray(data) && data[0]?.taxon) {
    taxon = data[0].taxon;
  } else if (data?.taxon) {
    taxon = data.taxon;
  } else if (Array.isArray(data) && data.length > 0 && typeof data[0] === "object" && data[0].taxon === undefined) {
    taxon = data[0];
  }

  let genero = taxon?.genus || "";
  let especie = taxon?.specificepithet || "";
  if (!genero || !especie) {
    // Fall back to parsing the file name for whichever part is missing.
    const fromName = parseNameFromFilename(filePath);
    genero = genero || fromName.genero;
    especie = especie || fromName.especie;
  }

  return {
    genero,
    especie,
    scientificname: safeString(taxon?.scientificname),
    taxonid: taxon?.taxonid ?? "",
    nomenclaturalstatus: safeString(taxon?.nomenclaturalstatus),
    taxonomicstatus: safeString(taxon?.taxonomicstatus),
    scientificnameauthorship: safeString(taxon?.scientificnameauthorship),
    source_file: path.basename(filePath),
  };
}

/**
 * Collects the deduplicated, pt-BR-sorted list of author names found in
 * all rows: both the scientificnameauthorship field and any authorship
 * tail that follows the binomial (optionally with one infraspecific word)
 * in scientificname.
 */
function gatherAllAuthors(rows) {
  const authors = new Set();
  const addAll = (names) => {
    for (const name of names) {
      if (name) authors.add(name);
    }
  };

  for (const row of rows) {
    addAll(extractAuthorsFromString(row.scientificnameauthorship || ""));

    const sciName = row.scientificname || "";
    if (sciName) {
      // Capture everything after "Genus [infragenus] epithet" as the tail.
      const match = sciName.match(/^[A-Z][a-zA-Z-]+(?:\s+[a-z-]+)?\s+[a-z-]+(.*)$/);
      if (match && match[1]) {
        const tail = match[1].replace(/[()]/g, " ").trim();
        addAll(extractAuthorsFromString(tail));
      }
    }
  }

  return Array.from(authors).sort((a, b) => a.localeCompare(b, "pt-BR"));
}

/**
 * Entry point: reads every output/*.json written by the fetch script,
 * flattens each into a row, and writes report.xlsx with a "Taxa" sheet
 * plus a deduplicated "Authors" sheet. Exits with status 1 when no JSON
 * files are found.
 */
function main() {
  const files = globSync(GLOB_PATTERN);
  if (files.length === 0) {
    console.error("Nenhum arquivo JSON encontrado em:", OUT_DIR);
    process.exit(1);
  }
  console.log(`Arquivos JSON encontrados: ${files.length}`);

  // Unreadable files still get a placeholder row so they appear in the report.
  const emptyRow = (sourceFile) => ({
    genero: "",
    especie: "",
    scientificname: "",
    taxonid: "",
    nomenclaturalstatus: "",
    taxonomicstatus: "",
    scientificnameauthorship: "",
    source_file: sourceFile,
  });

  const rows = files.map((file) => {
    const data = readJsonFile(file);
    return data ? normalizeRecord(file, data) : emptyRow(path.basename(file));
  });

  const header = [
    "genero",
    "especie",
    "scientificname",
    "taxonid",
    "nomenclaturalstatus",
    "taxonomicstatus",
    "scientificnameauthorship",
    "source_file"
  ];
  const workbook = XLSX.utils.book_new();
  XLSX.utils.book_append_sheet(
    workbook,
    XLSX.utils.json_to_sheet(rows, { header }),
    "Taxa"
  );

  const authorSheet = XLSX.utils.json_to_sheet(
    gatherAllAuthors(rows).map((author) => ({ author })),
    { header: ["author"] }
  );
  XLSX.utils.book_append_sheet(workbook, authorSheet, "Authors");

  XLSX.writeFile(workbook, REPORT_PATH);
  console.log("Planilha gerada em:", REPORT_PATH);
}

main();
Loading