diff --git a/getcomputo-pub.fsx b/getcomputo-pub.fsx
index dce48c1..237fd21 100644
--- a/getcomputo-pub.fsx
+++ b/getcomputo-pub.fsx
@@ -76,7 +76,7 @@ type RepoError =
 
 let redirectStringRe = Regex(@"URL='(.*)'")
 
-let getAbstract (page: string) =
+let getBibTeX (page: string) =
 
     let htmlFirst = HtmlDocument.Load(page)
 
@@ -100,16 +100,29 @@ let getAbstract (page: string) =
     try
         html.CssSelect(".bibtex").Head.InnerText()
         |> DirtyParser.bibTeXFromString
-        |> _.Head.Properties["abstract"]
+        |> _.Head
         |> Result.Ok
     with e ->
-        printfn "Error getting abstract from %s: %s" page e.Message
+        printfn "Error getting BibTeX from %s: %s" page e.Message
         Result.Error e.Message
 
+let getAbstract (entry: BibTeXEntry) = entry.Properties["abstract"]
+
+let getBibTeXFromDict (d: Dictionary) =
+    d["repoObj"] :?> Repository
+    |> _.Homepage
+    |> getBibTeX
+    |> function
+        | Ok a -> DrBiber.DirtyParser.bibTeXToString [ a ]
+        | Error e ->
+            printfn "Error getting BibTeX from %s: %s" (d["repoObj"] :?> Repository).Name e
+            ""
+
 let getAbstractFromDict (d: Dictionary) =
     d["repoObj"] :?> Repository
     |> _.Homepage
-    |> getAbstract
+    |> getBibTeX
+    |> Result.map (fun bibTeX -> getAbstract bibTeX)
     |> function
         | Ok a -> a
         | Error e ->
@@ -159,6 +172,7 @@ let extractCitation (d: Dictionary) =
        description = d |> getSomeString "description"
        abstract' = d |> getAbstractFromDict
        repo = d |> getSomeString "repo"
+       bibtex = d |> getBibTeXFromDict
        pdf = d |> getAnotherThing "citation" |> getSomeString "pdf-url"
        url = d |> getAnotherThing "citation" |> getSomeString "url"
        draft = d |> getSomeString "draft" |}
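
The refactor above splits fetching from extraction: getBibTeX now returns the whole parsed entry, and getAbstract merely reads its abstract property, so a single fetch can feed both the abstract' and bibtex fields of extractCitation. A minimal sketch of how the pieces compose, for reference only: the URL is illustrative, and the BibTeXEntry/DrBiber.DirtyParser signatures are assumed from their usage in this patch rather than documented.

    // Minimal sketch (not part of the patch): exercising the refactored pipeline by hand.
    // Assumes getBibTeX : string -> Result<BibTeXEntry, string>, as the code above suggests.
    let demo () =
        // Illustrative URL only
        match getBibTeX "https://computo.sfds.asso.fr/published-paper-tsne" with
        | Ok entry ->
            // Same value extractCitation stores under abstract'
            printfn "abstract: %s" (getAbstract entry)
            // Same serialization getBibTeXFromDict stores under bibtex
            printfn "%s" (DrBiber.DirtyParser.bibTeXToString [ entry ])
        | Error msg ->
            printfn "lookup failed: %s" msg
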
diff --git a/site/mock-papers.yml b/site/mock-papers.yml
index c2c2964..c9915cd 100644
--- a/site/mock-papers.yml
+++ b/site/mock-papers.yml
@@ -20,6 +20,39 @@
     t-SNE are significantly better than those produced by other techniques
     on almost all of the data sets.
   authors: Laurens van der Maaten and Geoffrey Hinton
+  bibtex: >+
+    @article{van_der_maaten2008,
+      author = {van der Maaten, Laurens and Hinton, Geoffrey},
+      publisher = {Société Française de Statistique},
+      title = {Visualizing {Data} Using {t-SNE:} A Practical Computo Example
+        (Mock)},
+      journal = {Computo},
+      date = {2008-08-11},
+      url = {https://computo.sfds.asso.fr/published-paper-tsne},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {We present a new technique called “t-SNE” that visualizes
+        high-dimensional data by giving each datapoint a location in a two
+        or three-dimensional map. The technique is a variation of Stochastic
+        Neighbor Embedding {[}@hinton:stochastic{]} that is much easier to
+        optimize, and produces significantly better visualizations by
+        reducing the tendency to crowd points together in the center of the
+        map. t-SNE is better than existing techniques at creating a single
+        map that reveals structure at many different scales. This is
+        particularly important for high-dimensional data that lie on several
+        different, but related, low-dimensional manifolds, such as images of
+        objects from multiple classes seen from multiple viewpoints. For
+        visualizing the structure of very large data sets, we show how t-SNE
+        can use random walks on neighborhood graphs to allow the implicit
+        structure of all the data to influence the way in which a subset of
+        the data is displayed. We illustrate the performance of t-SNE on a
+        wide variety of data sets and compare it with many other
+        non-parametric visualization techniques, including Sammon mapping,
+        Isomap, and Locally Linear Embedding. The visualizations produced by
+        t-SNE are significantly better than those produced by other
+        techniques on almost all of the data sets.}
+    }
+
   date: 2008-08-11
   description: >
     This page is a reworking of the original t-SNE article using the Computo template. It aims to help authors submitting to the journal by using some advanced formatting features. We warmly thank the authors of t-SNE and the editor of JMLR for allowing us to use their work to illustrate the Computo spirit.
@@ -29,7 +62,7 @@
   pdf: ''
   repo: published-paper-tsne
   title: Visualizing Data using t-SNE (mock contributon)
-  url: https://computo.sfds.asso.fr/published-paper-tsne
+  url: https://computo-journal.org/published-paper-tsne
   year: 2008
 - abstract': >-
     We present a new technique called “t-SNE” that visualizes
@@ -53,6 +86,39 @@
     t-SNE are significantly better than those produced by other techniques
     on almost all of the data sets.
   authors: Laurens van der Maaten and Geoffrey Hinton
+  bibtex: >+
+    @article{van_der_maaten2008,
+      author = {van der Maaten, Laurens and Hinton, Geoffrey},
+      publisher = {French Statistical Society},
+      title = {Visualizing {Data} Using {t-SNE:} A Practical {Computo}
+        Example (Mock)},
+      journal = {Computo},
+      date = {2008-08-11},
+      url = {https://computo-journal.org/published-paper-tsne-R},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {We present a new technique called “t-SNE” that visualizes
+        high-dimensional data by giving each datapoint a location in a two
+        or three-dimensional map. The technique is a variation of Stochastic
+        Neighbor Embedding {[}@hinton:stochastic{]} that is much easier to
+        optimize, and produces significantly better visualizations by
+        reducing the tendency to crowd points together in the center of the
+        map. t-SNE is better than existing techniques at creating a single
+        map that reveals structure at many different scales. This is
+        particularly important for high-dimensional data that lie on several
+        different, but related, low-dimensional manifolds, such as images of
+        objects from multiple classes seen from multiple viewpoints. For
+        visualizing the structure of very large data sets, we show how t-SNE
+        can use random walks on neighborhood graphs to allow the implicit
+        structure of all the data to influence the way in which a subset of
+        the data is displayed. We illustrate the performance of t-SNE on a
+        wide variety of data sets and compare it with many other
+        non-parametric visualization techniques, including Sammon mapping,
+        Isomap, and Locally Linear Embedding. The visualizations produced by
+        t-SNE are significantly better than those produced by other
+        techniques on almost all of the data sets.}
+    }
+
   date: 2008-08-11
   description: >
     This page is a reworking of the original t-SNE article using the Computo template. It aims to help authors submitting to the journal by using some advanced formatting features. We warmly thank the authors of t-SNE and the editor of JMLR for allowing us to use their work to illustrate the Computo spirit.
@@ -62,5 +128,5 @@
   pdf: ''
   repo: published-paper-tsne-R
   title: Visualizing Data using t-SNE (mock contributon)
-  url: https://computo.sfds.asso.fr/published-paper-tsne-R
+  url: https://computo-journal.org/published-paper-tsne-R
   year: 2008
diff --git a/site/publications.ejs b/site/publications.ejs
index b08574a..a884dba 100644
--- a/site/publications.ejs
+++ b/site/publications.ejs
@@ -10,6 +10,7 @@ for (const item of items) {
   let doiurl = item.url === "" ? "https://doi.org/" + item.doi : item.url;
   let bibtitle = item.title.replace(/'/g, "\\'");
   let bibauthors = item.authors.replace(/'/g, "\\'");
+  let bibtex = item.bibtex.replace(/'/g, "\\'").replace(/\"/g, '&quot;').replace(/\r?\n/g, "\\n");
   if (item.year !== currentYear) {
     if (currentYear !== null) {
 %>
@@ -52,7 +53,7 @@ for (const item of items) {
             PDF
           <% } %> -->
           Sources (Git)
-          <a href="#" onclick="generateBibTex('<%= bibtitle %>', '<%= bibauthors %>', <%= item.year %>, '<%= doiurl %>')">BibTeX</a>
+          <a href="#" onclick="generateBibTex('<%= bibtex %>')">BibTeX</a>
@@ -124,19 +125,7 @@ for (const item of items) {
       $('#abstractContent').text(abstract);
     });
 
-    function generateBibTex(title, authors, year, url) {
-      // Generate a simple BibTeX key from the first author's last name and year
-      var firstAuthor = authors.split(' and ')[0].split(' ').pop().toLowerCase();
-      var bibKey = firstAuthor + year;
-
-      var bibTeX = '@article{' + bibKey + ',\n' +
-        ' title={' + title + '},\n' +
-        ' author={' + authors + '},\n' +
-        ' journal={Computo},\n' +
-        ' year={' + year + '},\n' +
-        ' url={' + url + '}\n' +
-        '}';
-
+    function generateBibTex(bibTeX) {
       // Create a temporary textarea to copy the BibTeX to clipboard
      var textArea = document.createElement('textarea');
      textArea.value = bibTeX;
diff --git a/site/published.yml b/site/published.yml
index fae667b..c2d8746 100644
--- a/site/published.yml
+++ b/site/published.yml
@@ -8,6 +8,27 @@
     data and data from an ultra running race, where the method yields
     excellent clustering and variable selection performance.
   authors: Julien Jacques and Thomas Brendan Murphy
+  bibtex: >+
+    @article{jacques2025,
+      author = {Jacques, Julien and Brendan Murphy, Thomas},
+      publisher = {French Statistical Society},
+      title = {Model-Based {Clustering} and {Variable} {Selection} for
+        {Multivariate} {Count} {Data}},
+      journal = {Computo},
+      date = {2025-07-01},
+      doi = {10.57750/6v7b-8483},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Model-based clustering provides a principled way of
+        developing clustering methods. We develop a new model-based
+        clustering method for count data. The method combines clustering
+        and variable selection for improved clustering. The method is based
+        on conditionally independent Poisson mixture models and Poisson
+        generalized linear models. The method is demonstrated on simulated
+        data and data from an ultra running race, where the method yields
+        excellent clustering and variable selection performance.}
+    }
+
   date: 2025-07-01
   description: ''
   doi: 10.57750/6v7b-8483
@@ -40,6 +61,41 @@
     hospitalizations at Bordeaux University Hospital using public data
     and electronic health records.
   authors: Thomas Ferté, Kalidou Ba, Dan Dutartre, Pierrick Legrand, Vianney Jouhet, Rodolphe Thiébaut, Xavier Hinaut and Boris P Hejblum
+  bibtex: >+
+    @article{ferté2025,
+      author = {Ferté, Thomas and Ba, Kalidou and Dutartre, Dan and Legrand,
+        Pierrick and Jouhet, Vianney and Thiébaut, Rodolphe and Hinaut,
+        Xavier and P Hejblum, Boris},
+      publisher = {French Statistical Society},
+      title = {Reservoir {Computing} in {R:} A {Tutorial} for {Using}
+        Reservoirnet to {Predict} {Complex} {Time-Series}},
+      journal = {Computo},
+      date = {2025-06-27},
+      doi = {10.57750/arxn-6z34},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Reservoir Computing (RC) is a machine learning method
+        based on neural networks that efficiently process information
+        generated by dynamical systems. It has been successful in solving
+        various tasks including time series forecasting, language processing
+        or voice processing. RC is implemented in `Python` and `Julia` but
+        not in `R`. This article introduces `reservoirnet`, an `R` package
+        providing access to the `Python` API `ReservoirPy`, allowing `R`
+        users to harness the power of reservoir computing. This article
+        provides an introduction to the fundamentals of RC and showcases its
+        real-world applicability through three distinct sections. First, we
+        cover the foundational concepts of RC, setting the stage for
+        understanding its capabilities. Next, we delve into the practical
+        usage of `reservoirnet` through two illustrative examples. These
+        examples demonstrate how it can be applied to real-world problems,
+        specifically, regression of COVID-19 hospitalizations and
+        classification of Japanese vowels. Finally, we present a
+        comprehensive analysis of a real-world application of
+        `reservoirnet`, where it was used to forecast COVID-19
+        hospitalizations at Bordeaux University Hospital using public data
+        and electronic health records.}
+    }
+
   date: 2025-06-27
   description: ''
   doi: 10.57750/arxn-6z34
@@ -67,6 +123,33 @@
     efficient algorithms based on thinning methods, which are compiled
     using the `Rcpp` package while providing a user-friendly interface.
   authors: Daphné Giorgi, Sarah Kaakai and Vincent Lemaire
+  bibtex: >+
+    @article{giorgi2025,
+      author = {Giorgi, Daphné and Kaakai, Sarah and Lemaire, Vincent},
+      publisher = {French Statistical Society},
+      title = {Efficient Simulation of Individual-Based Population Models},
+      journal = {Computo},
+      date = {2025-01-27},
+      doi = {10.57750/sfxn-1t05},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {The `R` Package `IBMPopSim` facilitates the simulation of
+        the random evolution of heterogeneous populations using stochastic
+        Individual-Based Models (IBMs). The package enables users to
+        simulate population evolution, in which individuals are
+        characterized by their age and some characteristics, and the
+        population is modified by different types of events, including
+        births/arrivals, death/exit events, or changes of characteristics.
+        The frequency at which an event can occur to an individual can
+        depend on their age and characteristics, but also on the
+        characteristics of other individuals (interactions). Such models
+        have a wide range of applications in fields including actuarial
+        science, biology, ecology or epidemiology. `IBMPopSim` overcomes the
+        limitations of time-consuming IBMs simulations by implementing new
+        efficient algorithms based on thinning methods, which are compiled
+        using the `Rcpp` package while providing a user-friendly interface.}
+    }
+
   date: 2025-01-27
   description: >
     This document provides a full description of the Stochastic Individual-Based Models (IBMs) that can be implemented in the IBMPopSim package. A unified mathematical and simulation framework is given, with a detailed description of the simulation algorithm. Examples of applications for the package are also provided, showing the performance and flexibility of IBMPopSim.
@@ -97,6 +180,35 @@
     \textless
     https://github.com/cambroise/spectral-bridges-Rpackage\textgreater).
   authors: Félix Laplante and Christophe Ambroise
+  bibtex: >+
+    @article{laplante2024,
+      author = {Laplante, Félix and Ambroise, Christophe},
+      publisher = {French Statistical Society},
+      title = {Spectral {Bridges}},
+      journal = {Computo},
+      date = {2024-12-13},
+      doi = {10.57750/1gr8-bk61},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {In this paper, Spectral Bridges, a novel clustering
+        algorithm, is introduced. This algorithm builds upon the traditional
+        k-means and spectral clustering frameworks by subdividing data into
+        small Voronoï regions, which are subsequently merged according to a
+        connectivity measure. Drawing inspiration from Support Vector
+        Machine’s margin concept, a non-parametric clustering approach is
+        proposed, building an affinity margin between each pair of Voronoï
+        regions. This approach delineates intricate, non-convex cluster
+        structures and is robust to hyperparameter choice. The numerical
+        experiments underscore Spectral Bridges as a fast, robust, and
+        versatile tool for clustering tasks spanning diverse domains. Its
+        efficacy extends to large-scale scenarios encompassing both
+        real-world and synthetic datasets. The Spectral Bridge algorithm is
+        implemented both in Python (\textless
+        https://pypi.org/project/spectral-bridges\textgreater) and R
+        \textless
+        https://github.com/cambroise/spectral-bridges-Rpackage\textgreater).}
+    }
+
   date: 2024-12-13
   description: Scalable Spectral Clustering Based on Vector Quantization
   doi: 10.57750/1gr8-bk61
@@ -126,6 +238,36 @@
     tools for visualizing and summarizing conformal prediction
     intervals.
   authors: Herbert Susmann, Antoine Chambaz and Julie Josse
+  bibtex: >+
+    @article{susmann2024,
+      author = {Susmann, Herbert and Chambaz, Antoine and Josse, Julie},
+      publisher = {French Statistical Society},
+      title = {AdaptiveConformal: {An} {`R`} {Package} for {Adaptive}
+        {Conformal} {Inference}},
+      journal = {Computo},
+      date = {2024-07-18},
+      doi = {10.57750/edan-5f53},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Conformal Inference (CI) is a popular approach for
+        generating finite sample prediction intervals based on the output of
+        any point prediction method when data are exchangeable. Adaptive
+        Conformal Inference (ACI) algorithms extend CI to the case of
+        sequentially observed data, such as time series, and exhibit strong
+        theoretical guarantees without having to assume exchangeability of
+        the observed data. The common thread that unites algorithms in the
+        ACI family is that they adaptively adjust the width of the generated
+        prediction intervals in response to the observed data. We provide a
+        detailed description of five ACI algorithms and their theoretical
+        guarantees, and test their performance in simulation studies. We
+        then present a case study of producing prediction intervals for
+        influenza incidence in the United States based on black-box point
+        forecasts. Implementations of all the algorithms are released as an
+        open-source `R` package, `AdaptiveConformal`, which also includes
+        tools for visualizing and summarizing conformal prediction
+        intervals.}
+    }
+
   date: 2024-07-18
   description: ''
   doi: 10.57750/edan-5f53
@@ -167,6 +309,49 @@
     implementing the Bayesian estimation of marked log-Gaussian Cox
     processes using the R-INLA package of the R statistical software.
   authors: Juliette Legrand, François Pimont, Jean-Luc Dupuy and Thomas Opitz
+  bibtex: >+
+    @article{legrand2024,
+      author = {Legrand, Juliette and Pimont, François and Dupuy, Jean-Luc
+        and Opitz, Thomas},
+      publisher = {French Statistical Society},
+      title = {Bayesian Spatiotemporal Modelling of Wildfire Occurrences and
+        Sizes for Projections Under Climate Change},
+      journal = {Computo},
+      date = {2024-07-12},
+      doi = {10.57750/4y84-4t68},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Appropriate spatiotemporal modelling of wildfire activity
+        is crucial for its prediction and risk management. Here, we focus on
+        wildfire risk in the Aquitaine region in the Southwest of France and
+        its projection under climate change. We study whether wildfire risk
+        could further increase under climate change in this specific region,
+        which does not lie in the historical core area of wildfires in
+        Southeastern France, corresponding to the Southwest. For this
+        purpose, we consider a marked spatiotemporal point process, a
+        flexible model for occurrences and magnitudes of such environmental
+        risks, where the magnitudes are defined as the burnt areas. The
+        model is first calibrated using 14 years of past observation data of
+        wildfire occurrences and weather variables, and then applied for
+        projection of climate-change impacts using simulations of numerical
+        climate models until 2100 as new inputs. We work within the
+        framework of a spatiotemporal Bayesian hierarchical model, and we
+        present the workflow of its implementation for a large dataset at
+        daily resolution for 8km-pixels using the INLA-SPDE approach. The
+        assessment of the posterior distributions shows a satisfactory fit
+        of the model for the observation period. We stochastically simulate
+        projections of future wildfire activity by combining climate model
+        output with posterior simulations of model parameters. Depending on
+        climate models, spline-smoothed projections indicate low to moderate
+        increase of wildfire activity under climate change. The increase is
+        weaker than in the historical core area, which we attribute to
+        different weather conditions (oceanic versus Mediterranean). Besides
+        providing a relevant case study of environmental risk modelling,
+        this paper is also intended to provide a full workflow for
+        implementing the Bayesian estimation of marked log-Gaussian Cox
+        processes using the R-INLA package of the R statistical software.}
+    }
+
   date: 2024-07-12
   description: ''
   doi: 10.57750/4y84-4t68
@@ -199,6 +384,40 @@
     inequality-based approaches in particular when the underlying number
     of changes is small compared to the data length.
   authors: Liudmila Pishchagina, Guillem Rigaill and Vincent Runge
+  bibtex: >+
+    @article{pishchagina2024,
+      author = {Pishchagina, Liudmila and Rigaill, Guillem and Runge,
+        Vincent},
+      publisher = {French Statistical Society},
+      title = {Geometric-Based {Pruning} {Rules} for {Change} {Point}
+        {Detection} in {Multiple} {Independent} {Time} {Series}},
+      journal = {Computo},
+      date = {2024-07-12},
+      doi = {10.57750/9vvx-eq57},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {We address the challenge of identifying multiple change
+        points in a group of independent time series, assuming these change
+        points occur simultaneously in all series and their number is
+        unknown. The search for the best segmentation can be expressed as a
+        minimization problem over a given cost function. We focus on dynamic
+        programming algorithms that solve this problem exactly. When the
+        number of changes is proportional to data length, an
+        inequality-based pruning rule encoded in the PELT algorithm leads to
+        a linear time complexity. Another type of pruning, called functional
+        pruning, gives a close-to-linear time complexity whatever the number
+        of changes, but only for the analysis of univariate time series. We
+        propose a few extensions of functional pruning for multiple
+        independent time series based on the use of simple geometric shapes
+        (balls and hyperrectangles). We focus on the Gaussian case, but some
+        of our rules can be easily extended to the exponential family. In a
+        simulation study we compare the computational efficiency of
+        different geometric-based pruning rules. We show that for a small
+        number of time series some of them ran significantly faster than
+        inequality-based approaches in particular when the underlying number
+        of changes is small compared to the data length.}
+    }
+
   date: 2024-07-12
   description: ''
   doi: 10.57750/9vvx-eq57
@@ -221,6 +440,30 @@
     addition, we provide an identification module to easily explore the
     task difficulty of datasets and worker capabilities.
   authors: Tanguy Lefort, Benjamin Charlier, Alexis Joly and Joseph Salmon
+  bibtex: >+
+    @article{lefort2024,
+      author = {Lefort, Tanguy and Charlier, Benjamin and Joly, Alexis and
+        Salmon, Joseph},
+      publisher = {French Statistical Society},
+      title = {Peerannot: Classification for Crowdsourced Image Datasets
+        with {Python}},
+      journal = {Computo},
+      date = {2024-05-07},
+      doi = {10.57750/qmaz-gr91},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Crowdsourcing is a quick and easy way to collect labels
+        for large datasets, involving many workers. However, workers often
+        disagree with each other. Sources of error can arise from the
+        workers’ skills, but also from the intrinsic difficulty of the task.
+        We present `peerannot`: a `Python` library for managing and learning
+        from crowdsourced labels for classification. Our library allows
+        users to aggregate labels from common noise models or train a deep
+        learning-based classifier directly from crowdsourced labels. In
+        addition, we provide an identification module to easily explore the
+        task difficulty of datasets and worker capabilities.}
+    }
+
   date: 2024-05-07
   description: >
     Crowdsourcing is a quick and easy way to collect labels for large datasets, involving many workers.
@@ -255,6 +498,34 @@
     generalizations, in particular the incorporation of such ideas in
     adaptive importance sampling schemes.
   authors: Maxime El Masri, Jérôme Morio and Florian Simatos
+  bibtex: >+
+    @article{el_masri2024,
+      author = {El Masri, Maxime and Morio, Jérôme and Simatos, Florian},
+      publisher = {French Statistical Society},
+      title = {Optimal Projection for Parametric Importance Sampling in High
+        Dimensions},
+      journal = {Computo},
+      date = {2024-03-11},
+      doi = {10.57750/jjza-6j82},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {We propose a dimension reduction strategy in order to
+        improve the performance of importance sampling in high dimensions.
+        The idea is to estimate variance terms in a small number of suitably
+        chosen directions. We first prove that the optimal directions, i.e.,
+        the ones that minimize the Kullback-\/-Leibler divergence with the
+        optimal auxiliary density, are the eigenvectors associated with
+        extreme (small or large) eigenvalues of the optimal covariance
+        matrix. We then perform extensive numerical experiments showing that
+        as dimension increases, these directions give estimations which are
+        very close to optimal. Moreover, we demonstrate that the estimation
+        remains accurate even when a simple empirical estimator of the
+        covariance matrix is used to compute these directions. The
+        theoretical and numerical results open the way for different
+        generalizations, in particular the incorporation of such ideas in
+        adaptive importance sampling schemes.}
+    }
+
   date: 2024-03-11
   description: >
     This document provides a dimension-reduction strategy in order to improve the performance of importance sampling in high dimensions.
@@ -275,6 +546,25 @@
     data we are given, we can retrieve some repulsiveness between
     antennas, which was expected for engineering reasons.
   authors: Hamza Adrat and Laurent Decreusefond
+  bibtex: >+
+    @article{adrat2024,
+      author = {Adrat, Hamza and Decreusefond, Laurent},
+      publisher = {French Statistical Society},
+      title = {Point {Process} {Discrimination} {According} to {Repulsion}},
+      journal = {Computo},
+      date = {2024-01-25},
+      doi = {10.57750/3r07-aw28},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {In numerous applications, cloud of points do seem to
+        exhibit *repulsion* in the intuitive sense that there is no local
+        cluster as in a Poisson process. Motivated by data coming from
+        cellular networks, we devise a classification algorithm based on the
+        form of the Voronoi cells. We show that, in the particular set of
+        data we are given, we can retrieve some repulsiveness between
+        antennas, which was expected for engineering reasons.}
+    }
+
   date: 2024-01-25
   description: ''
   doi: 10.57750/3r07-aw28
@@ -329,6 +619,61 @@
     analyze pest surveys and field experiments conducted to assess the
     efficacy of pest treatments.
   authors: Armand Favrot and David Makowski
+  bibtex: >+
+    @article{favrot2024,
+      author = {Favrot, Armand and Makowski, David},
+      publisher = {French Statistical Society},
+      title = {A Hierarchical Model to Evaluate Pest Treatments from
+        Prevalence and Intensity Data},
+      journal = {Computo},
+      date = {2024-01-09},
+      doi = {10.57750/6cgk-g727},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {In plant epidemiology, pest abundance is measured in field
+        trials using metrics assessing either pest prevalence (fraction of
+        the plant population infected) or pest intensity (average number of
+        pest individuals present in infected plants). Some of these trials
+        rely on prevalence, while others rely on intensity, depending on the
+        protocols. In this paper, we present a hierarchical Bayesian model
+        able to handle both types of data. In this model, the intensity and
+        prevalence variables are derived from a latent variable representing
+        the number of pest individuals on each host individual, assumed to
+        follow a Poisson distribution. Effects of pest treatments, time
+        trend, and between-trial variability are described using fixed and
+        random effects. We apply the model to a real data set in the context
+        of aphid control in sugar beet fields. In this data set, prevalence
+        and intensity were derived from aphid counts observed on either
+        factorial trials testing different types of pesticides treatments or
+        field surveys monitoring aphid abundance. Next, we perform
+        simulations to assess the impacts of using either prevalence or
+        intensity data, or both types of data simultaneously, on the
+        accuracy of the model parameter estimates and on the ranking of
+        pesticide treatment efficacy. Our results show that, when pest
+        prevalence and pest intensity data are collected separately in
+        different trials, the model parameters are more accurately estimated
+        using both types of trials than using one type of trials only. When
+        prevalence data are collected in all trials and intensity data are
+        collected in a subset of trials, estimations and pest treatment
+        ranking are more accurate using both types of data than using
+        prevalence data only. When only one type of observation can be
+        collected in a pest survey or in an experimental trial, our analysis
+        indicates that it is better to collect intensity data than
+        prevalence data when all or most of the plants are expected to be
+        infested, but that both types of data lead to similar results when
+        the level of infestation is low to moderate. Finally, our
+        simulations show that it is unlikely to obtain accurate results with
+        fewer than 40 trials when assessing the efficacy of pest control
+        treatments based on prevalence and intensity data. Because of its
+        flexibility, our model can be used to evaluate and rank the efficacy
+        of pest treatments using either prevalence or intensity data, or
+        both types of data simultaneously. As it can be easily implemented
+        using standard Bayesian packages, we hope that it will be useful to
+        agronomists, plant pathologists, and applied statisticians to
+        analyze pest surveys and field experiments conducted to assess the
+        efficacy of pest treatments.}
+    }
+
   date: 2024-01-09
   description: ''
   doi: 10.57750/6cgk-g727
@@ -364,6 +709,42 @@
     second, more local, RF. Unfortunately, these approaches, although
     interesting, do not provide conclusive results.
   authors: Alice Cleynen, Louis Raynal and Jean-Michel Marin
+  bibtex: >+
+    @article{cleynen2023,
+      author = {Cleynen, Alice and Raynal, Louis and Marin, Jean-Michel},
+      publisher = {French Statistical Society},
+      title = {Local Tree Methods for Classification: A Review and Some Dead
+        Ends},
+      journal = {Computo},
+      date = {2023-12-14},
+      doi = {10.57750/3j8m-8d57},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Random Forests (RF) {[}@breiman:2001{]} are very popular
+        machine learning methods. They perform well even with little or no
+        tuning, and have some theoretical guarantees, especially for sparse
+        problems {[}@biau:2012;@scornet:etal:2015{]}. These learning
+        strategies have been used in several contexts, also outside the
+        field of classification and regression. To perform Bayesian model
+        selection in the case of intractable likelihoods, the ABC Random
+        Forests (ABC-RF) strategy of @pudlo:etal:2016 consists in applying
+        Random Forests on training sets composed of simulations coming from
+        the Bayesian generative models. The ABC-RF technique is based on an
+        underlying RF for which the training and prediction phases are
+        separated. The training phase does not take into account the data to
+        be predicted. This seems to be suboptimal as in the ABC framework
+        only one observation is of interest for the prediction. In this
+        paper, we study tree-based methods that are built to predict a
+        specific instance in a classification setting. This type of methods
+        falls within the scope of local (lazy/instance-based/case specific)
+        classification learning. We review some existing strategies and
+        propose two new ones. The first consists in modifying the tree
+        splitting rule by using kernels, the second in using a first RF to
+        compute some local variable importance that is used to train a
+        second, more local, RF. Unfortunately, these approaches, although
+        interesting, do not provide conclusive results.}
+    }
+
   date: 2023-12-14
   description: ''
   doi: 10.57750/3j8m-8d57
@@ -396,6 +777,39 @@
     convergence properties of the estimation algorithm through
     simulation studies.
   authors: Maud Delattre and Estelle Kuhn
+  bibtex: >+
+    @article{delattre2023,
+      author = {Delattre, Maud and Kuhn, Estelle},
+      publisher = {French Statistical Society},
+      title = {Computing an Empirical {Fisher} Information Matrix Estimate
+        in Latent Variable Models Through Stochastic Approximation},
+      journal = {Computo},
+      date = {2023-11-21},
+      doi = {10.57750/r5gx-jk62},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {The Fisher information matrix (FIM) is a key quantity in
+        statistics. However its exact computation is often not trivial. In
+        particular in many latent variable models, it is intricated due to
+        the presence of unobserved variables. Several methods have been
+        proposed to approximate the FIM when it can not be evaluated
+        analytically. Different estimates have been considered, in
+        particular moment estimates. However some of them require to compute
+        second derivatives of the complete data log-likelihood which leads
+        to some disadvantages. In this paper, we focus on the empirical
+        Fisher information matrix defined as an empirical estimate of the
+        covariance matrix of the score, which only requires to compute the
+        first derivatives of the log-likelihood. Our contribution consists
+        in presenting a new numerical method to evaluate this empirical
+        Fisher information matrix in latent variable model when the proposed
+        estimate can not be directly analytically evaluated. We propose a
+        stochastic approximation estimation algorithm to compute this
+        estimate as a by-product of the parameter estimate. We evaluate the
+        finite sample size properties of the proposed estimate and the
+        convergence properties of the estimation algorithm through
+        simulation studies.}
+    }
+
   date: 2023-11-21
   description: ''
   doi: 10.57750/r5gx-jk62
@@ -430,6 +844,40 @@
     network inference models. Applications to gut microbiome data and
     poplar’s methylation mixed with transcriptomic data are presented.
   authors: Edmond Sanou, Christophe Ambroise and Geneviève Robin
+  bibtex: >+
+    @article{sanou2023,
+      author = {Sanou, Edmond and Ambroise, Christophe and Robin, Geneviève},
+      publisher = {French Statistical Society},
+      title = {Inference of {Multiscale} {Gaussian} {Graphical} {Models}},
+      journal = {Computo},
+      date = {2023-06-28},
+      doi = {10.57750/1f4p-7955},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Gaussian Graphical Models (GGMs) are widely used in
+        high-dimensional data analysis to synthesize the interaction between
+        variables. In many applications, such as genomics or image analysis,
+        graphical models rely on sparsity and clustering to reduce
+        dimensionality and improve performances. This paper explores a
+        slightly different paradigm where clustering is not knowledge-driven
+        but performed simultaneously with the graph inference task. We
+        introduce a novel Multiscale Graphical Lasso (MGLasso) to improve
+        networks interpretability by proposing graphs at different
+        granularity levels. The method estimates clusters through a convex
+        clustering approach -\/-\/- a relaxation of \$k\$-means, and
+        hierarchical clustering. The conditional independence graph is
+        simultaneously inferred through a neighborhood selection scheme for
+        undirected graphical models. MGLasso extends and generalizes the
+        sparse group fused lasso problem to undirected graphical models. We
+        use continuation with Nesterov smoothing in a shrinkage-thresholding
+        algorithm (CONESTA) to propose a regularization path of solutions
+        along the group fused Lasso penalty, while the Lasso penalty is kept
+        constant. Extensive experiments on synthetic data compare the
+        performances of our model to state-of-the-art clustering methods and
+        network inference models. Applications to gut microbiome data and
+        poplar’s methylation mixed with transcriptomic data are presented.}
+    }
+
   date: 2023-06-28
   description: ''
   doi: 10.57750/1f4p-7955
@@ -461,6 +909,39 @@
     detection capabilities. A precise error decomposition allows clear
     analysis and highlights the remaining challenges.
   authors: Mathis Chagneux, Sylvain Le Corff, Pierre Gloaguen, Charles Ollion, Océane Lepâtre and Antoine Bruge
+  bibtex: >+
+    @article{chagneux2023,
+      author = {Chagneux, Mathis and Le Corff, Sylvain and Gloaguen, Pierre
+        and Ollion, Charles and Lepâtre, Océane and Bruge, Antoine},
+      publisher = {French Statistical Society},
+      title = {Macrolitter Video Counting on Riverbanks Using State Space
+        Models and Moving Cameras},
+      journal = {Computo},
+      date = {2023-02-16},
+      doi = {10.57750/845m-f805},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Litter is a known cause of degradation in marine
+        environments and most of it travels in rivers before reaching the
+        oceans. In this paper, we present a novel algorithm to assist waste
+        monitoring along watercourses. While several attempts have been made
+        to quantify litter using neural object detection in photographs of
+        floating items, we tackle the more challenging task of counting
+        directly in videos using boat-embedded cameras. We rely on
+        multi-object tracking (MOT) but focus on the key pitfalls of false
+        and redundant counts which arise in typical scenarios of poor
+        detection performance. Our system only requires supervision at the
+        image level and performs Bayesian filtering via a state space model
+        based on optical flow. We present a new open image dataset gathered
+        through a crowdsourced campaign and used to train a center-based
+        anchor-free object detector. Realistic video footage assembled by
+        water monitoring experts is annotated and provided for evaluation.
+        Improvements in count quality are demonstrated against systems built
+        from state-of-the-art multi-object trackers sharing the same
+        detection capabilities. A precise error decomposition allows clear
+        analysis and highlights the remaining challenges.}
+    }
+
   date: 2023-02-16
   description: ''
   doi: 10.57750/845m-f805
@@ -485,6 +966,30 @@
     practitioners working with copulae in \$\textbackslash
     textsf\{Python\}\$.
   authors: Alexis Boulin
+  bibtex: >+
+    @article{boulin2023,
+      author = {Boulin, Alexis},
+      publisher = {French Statistical Society},
+      title = {A {Python} {Package} for {Sampling} from {Copulae:} Clayton},
+      journal = {Computo},
+      date = {2023-01-12},
+      doi = {10.57750/4szh-t752},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {The package \$\textbackslash textsf\{clayton\}\$ is
+        designed to be intuitive, user-friendly, and efficient. It offers a
+        wide range of copula models, including Archimedean, Elliptical, and
+        Extreme. The package is implemented in pure \$\textbackslash
+        textsf\{Python\}\$, making it easy to install and use. In addition,
+        we provide detailed documentation and examples to help users get
+        started quickly. We also conduct a performance comparison with
+        existing \$\textbackslash textsf\{R\}\$ packages, demonstrating the
+        efficiency of our implementation. The \$\textbackslash
+        textsf\{clayton\}\$ package is a valuable tool for researchers and
+        practitioners working with copulae in \$\textbackslash
+        textsf\{Python\}\$.}
+    }
+
   date: 2023-01-12
   description: >
     The package $\textsf{clayton}$ is designed to be intuitive, user-friendly, and efficient. It offers a wide range of copula models, including Archimedean, Elliptical, and Extreme. The package is implemented in pure $\textsf{Python}$, making it easy to install and use.
@@ -523,6 +1028,46 @@
     reproducible workflow will be useful to ecologists and applied
     statisticians.
   authors: Olivier Gimenez, Maëlis Kervellec, Jean-Baptiste Fanjul, Anna Chaine, Lucile Marescot, Yoann Bollet and Christophe Duchamp
+  bibtex: >+
+    @article{gimenez2022,
+      author = {Gimenez, Olivier and Kervellec, Maëlis and Fanjul,
+        Jean-Baptiste and Chaine, Anna and Marescot, Lucile and Bollet,
+        Yoann and Duchamp, Christophe},
+      publisher = {French Statistical Society},
+      title = {Trade-Off Between Deep Learning for Species Identification
+        and Inference about Predator-Prey Co-Occurrence},
+      journal = {Computo},
+      date = {2022-04-22},
+      doi = {10.57750/yfm2-5f45},
+      issn = {2824-7795},
+      langid = {en},
+      abstract = {Deep learning is used in computer vision problems with
+        important applications in several scientific fields. In ecology for
+        example, there is a growing interest in deep learning for
+        automatizing repetitive analyses on large amounts of images, such as
+        animal species identification. However, there are challenging issues
+        toward the wide adoption of deep learning by the community of
+        ecologists. First, there is a programming barrier as most algorithms
+        are written in `Python` while most ecologists are versed in `R`.
+        Second, recent applications of deep learning in ecology have focused
+        on computational aspects and simple tasks without addressing the
+        underlying ecological questions or carrying out the statistical data
+        analysis to answer these questions. Here, we showcase a reproducible
+        `R` workflow integrating both deep learning and statistical models
+        using predator-prey relationships as a case study. We illustrate
+        deep learning for the identification of animal species on images
+        collected with camera traps, and quantify spatial co-occurrence
+        using multispecies occupancy models. Despite average model
+        classification performances, ecological inference was similar
+        whether we analysed the ground truth dataset or the classified
+        dataset. This result calls for further work on the trade-offs
+        between time and resources allocated to train models with deep
+        learning and our ability to properly address key ecological
+        questions with biodiversity monitoring. We hope that our
+        reproducible workflow will be useful to ecologists and applied
+        statisticians.}
+    }
+
   date: 2022-04-22
   description: ''
   doi: 10.57750/yfm2-5f45
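
Since the bibtex blocks above are copied to the clipboard verbatim by generateBibTex, a parse-back check can catch hand-editing mistakes before publication. A hedged sketch reusing the parser from getcomputo-pub.fsx: it assumes bibTeXFromString yields an F# list of entries whose Properties dictionary exposes ContainsKey, which the script's indexed access suggests but does not show.

    // Hedged sketch (not part of the patch): verify an edited bibtex block still parses.
    let parsesCleanly (bib: string) =
        try
            match DrBiber.DirtyParser.bibTeXFromString bib with
            | [] -> false
            | entries ->
                // ContainsKey is assumed; the script above only shows indexed access.
                entries |> List.forall (fun e -> e.Properties.ContainsKey "abstract")
        with _ -> false
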