clowder-framework · MBcode · Oct 14, 2022 · Oct 14, 2022 · Oct 21, 2022 · Oct 27, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@ script prior to upgrading to minimize the downtime.
 - In the docker container the folder /home/clowder/data is now whitelisted by default for uploading by reference. 
   This can be changed using the environment variable CLOWDER_SOURCEPATH.
 - The current CLA for developers of clowder.
+- sitemap.xml route that lists dataset pages so they can be crawled for their embedded JSON-LD, for Google Dataset Search
 
 ### Fixed
 - Send email to all admins in a single email when a user submits 'Request access' for a space

diff --git a/app/controllers/Application.scala b/app/controllers/Application.scala
@@ -18,7 +18,8 @@ import scala.collection.mutable.ListBuffer
 @Singleton
 class Application @Inject()(files: FileService, collections: CollectionService, datasets: DatasetService,
                             spaces: SpaceService, events: EventService, comments: CommentService,
-                            sections: SectionService, users: UserService, selections: SelectionService) extends SecuredController {
+                            sections: SectionService, users: UserService, selections: SelectionService,
+                            tree: TreeService) extends SecuredController {
   /**
    * Redirect any url's that have a trailing /
    *
@@ -84,6 +85,59 @@ class Application @Inject()(files: FileService, collections: CollectionService,
     }
   }
 
+  /**
+   * Returns the sitemap.xml listing the dataset pages, so they can be scraped for their JSON-LD scripts.
+   * It was suggested to start this like the swagger route; if the result is never cached, this should change,
+   *  because otherwise a filler file has to exist at the resource path (it could later serve as the cache).
+   */
+import play.api.libs.json._  //put at top
+import api.Permission.Permission //put at top
+import models.User
+
+  /**
+   * Serves a sitemap.xml listing one dataset page URL per dataset, so that crawlers
+   * (e.g. Google Dataset Search) can reach the JSON-LD embedded in those pages.
+   *
+   * The URLs are built from `Utils.baseUrl(request)`, and the datasets come from
+   * `tree.getDatasets` called with the anonymous user.
+   *
+   * @return 200 with the sitemap served as `application/xml`, or 404 when the placeholder
+   *         resource `/public/sitemap.xml` is missing from the application.
+   */
+  def sitemap = Action { implicit request =>
+    // The bundled file is only a placeholder for a possible future cache: its content is
+    // never read, but the lookup still decides between the 200 and the 404 answer.
+    Play.resource("/public/sitemap.xml") match {
+      case Some(_) => {
+        // Round-trip through java.net.URL so a malformed base URL fails here, as before.
+        val clowderurl = new URL(Utils.baseUrl(request))
+
+        // NOTE(review): `false` presumably means "not limited to datasets owned by the
+        // user" (the earlier `true` variant was noted as "not owned by anon") - confirm
+        // against TreeService.getDatasets.
+        val user = User.anonymous
+        val datasetList = tree.getDatasets(false, user)
+
+        val header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+          "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
+
+        // One <url> element per dataset; each id is read from the "id" field of the entry.
+        // NOTE(review): ids are inserted without XML escaping; assumed to be plain
+        // identifiers without '&' or '<' - confirm.
+        val entries = datasetList.map { dataset =>
+          val datasetId = (dataset \ "id").as[String]
+          "\n<url><loc>" + clowderurl + "/datasets/" + datasetId + "</loc></url>"
+        }
+
+        // Sitemaps are XML documents, so declare that instead of Play's default text/plain
+        // content type for String bodies.
+        val body = header + entries.mkString + "\n</urlset>"
+        Ok(body).as("application/xml; charset=utf-8")
+      }
+      case None => NotFound("Could not find sitemap.xml")
+    }
+  }
+
   /**
    * Main page.
    */

diff --git a/conf/routes b/conf/routes
@@ -297,6 +297,12 @@ GET            /javascriptRoutes
 # ----------------------------------------------------------------------
 GET            /swagger                                                                 @controllers.Application.swagger
 GET            /swaggerUI                                                               @controllers.Application.swaggerUI
+
+# ----------------------------------------------------------------------
+# SITEMAP
+# ----------------------------------------------------------------------
+GET            /sitemap.xml                                                             @controllers.Application.sitemap
+GET            /sitemap                                                                 @controllers.Application.sitemap
 
 # ----------------------------------------------------------------------
 # RESTful API

diff --git a/conf/sitemap.xml b/conf/sitemap.xml
@@ -0,0 +1,4 @@
+Placeholder for now:
+the route is set up to read from this cached file, so it expects the file to exist
+ even though the caching has not been done yet
+ and right now the sitemap is being returned directly.
diff --git a/public/sitemap.xml b/public/sitemap.xml
@@ -0,0 +1,6 @@
+Filler that will be replaced with the cached sitemap,
+though that idea might be on hold if we have to
+ worry about who has access to this sitemap,
+ as Clowder v1 only has public and hidden,
+while v2 might get a private setting where
+ you can see that it is there but not download it without auth.