From 96d2387ea67e755c4becdc18836d4cb29c651a39 Mon Sep 17 00:00:00 2001 From: Fernando Rios Date: Thu, 3 Jul 2025 12:08:07 -0400 Subject: [PATCH 1/7] Convert old BagIt profiles to DART settings format and eliminate old BagIt Spec ones --- bagger/profiles/README.md | 23 ++ bagger/profiles/redata-bagit-dart-v3.0.json | 345 ++++++++++++++++++++ profiles/README.md | 7 - profiles/redata-bagit-dart-v2.2.json | 330 ------------------- profiles/redata-bagit-v2.2.json | 82 ----- 5 files changed, 368 insertions(+), 419 deletions(-) create mode 100644 bagger/profiles/README.md create mode 100644 bagger/profiles/redata-bagit-dart-v3.0.json delete mode 100644 profiles/README.md delete mode 100644 profiles/redata-bagit-dart-v2.2.json delete mode 100644 profiles/redata-bagit-v2.2.json diff --git a/bagger/profiles/README.md b/bagger/profiles/README.md new file mode 100644 index 0000000..bf64913 --- /dev/null +++ b/bagger/profiles/README.md @@ -0,0 +1,23 @@ +# BagIt Profile + +This profile is used by ReBACH bagger to generate bags. The profile is in the DART format instead of the BagIt Profile Specification Format as the former is a superset of the latter and can be more easily used in our dart-runner workflow. One notable deficiency of the official specification, per the DART documentation, is that "this format cannot describe information about required tags outside of the bag-info.txt file." Refer to the [DART BagIt documentation](https://aptrust.github.io/dart-docs/users/bagit/) for details. + +## Implementation Conventions + +- The format of the file is a DART settings JSON object. The profile itself is contained within the `bagItProfiles` key. +- Each profile is versioned as indicated in the file name and in the `bagItProfileInfo.version` value. +- Prior versions will remain available for checking bags generated with that version. + +## Profile Creation + +To create a profile from scratch, start with the DART GUI. 
+ +- Create a new profile from scratch or by cloning an existing one. +- Create a new storage service to upload bags to, e.g., Wasabi (optional). +- Export the settings via the Settings -> Export Settings menu. Check only the profile and storage created above. Uncheck any items in the App Settings section. +- Save the JSON into a file named `redata-bagit-dart-vXX.json` where XX is the version. + +## Profile Updates + +Profiles can be updated by manually editing the JSON or by importing the `redata-bagit-dart-vXX.json` into DART, editing as needed, and re-exporting. + diff --git a/bagger/profiles/redata-bagit-dart-v3.0.json b/bagger/profiles/redata-bagit-dart-v3.0.json new file mode 100644 index 0000000..b3599ec --- /dev/null +++ b/bagger/profiles/redata-bagit-dart-v3.0.json @@ -0,0 +1,345 @@ +{ + "id": "00000000-0000-0000-0000-000000000000", + "appSettings": [], + "bagItProfiles": [ + { + "id": "7f0cd963-4750-4c10-a3de-0c65bb0d0dc5", + "name": "ReDATA", + "description": "BagIt Profile for ReDATA BagIt preservation using ReBACH (based on APTrust BagIt Profile v2.2)", + "acceptBagItVersion": [ + "0.97", + "1.0" + ], + "acceptSerialization": [ + "application/tar" + ], + "allowFetchTxt": false, + "bagItProfileInfo": { + "bagItProfileIdentifier": "https://raw.githubusercontent.com/UAL-RE/ReBACH/main/bagger/profiles/redata-bagit-dart-v3.0.json", + "bagItProfileVersion": "", + "contactEmail": "redata@arizona.edu", + "contactName": "ReDATA Administrator", + "externalDescription": "Profile for ReDATA content. Based on APTrust BagIt profile v2.2. 
bagItProfileIdentifier points to a DART settings object which contains this profile.", + "sourceOrganization": "redata.arizona.edu", + "version": "3.0" + }, + "manifestsRequired": [ + "md5" + ], + "manifestsAllowed": [ + "md5", + "sha256" + ], + "tagManifestsRequired": [], + "tagManifestsAllowed": [ + "md5", + "sha256" + ], + "tagFilesAllowed": [ + "*" + ], + "tags": [ + { + "id": "39b8ac8a-8e3d-47c3-9cda-5edd0d4ad1fb", + "tagFile": "bagit.txt", + "tagName": "BagIt-Version", + "required": true, + "values": [ + "0.97", + "1.0" + ], + "defaultValue": "0.97", + "userValue": "", + "help": "Which version of the BagIt specification describes this bag's format?", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": false + }, + { + "id": "2a914ea2-ee3b-4c53-96e1-4f93f641338b", + "tagFile": "bagit.txt", + "tagName": "Tag-File-Character-Encoding", + "required": true, + "values": [ + "UTF-8" + ], + "defaultValue": "UTF-8", + "userValue": "", + "help": "How are this bag's plain-text tag files encoded? (Hint: usually UTF-8)", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": false + }, + { + "id": "567451b6-1f30-4bda-b66b-9a657426d5e5", + "tagFile": "bag-info.txt", + "tagName": "Source-Organization", + "required": true, + "values": [], + "defaultValue": "University of Arizona Libraries", + "userValue": "", + "help": "The name of the organization that produced this bag, or is responsible for its contents.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false + }, + { + "id": "117e46d8-096f-41f1-8c94-7d9202b9477b", + "tagFile": "bag-info.txt", + "tagName": "Bag-Count", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The number of bags that make up this object. Set this only if you are packaging a single object into multiple bags. 
See https://wiki.aptrust.org/Bagging_specifications for info on naming multi-part APTrust bags.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": true + }, + { + "id": "41b75504-e54d-49a1-aad4-c8a4921d15ce", + "tagFile": "bag-info.txt", + "tagName": "Bagging-Date", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The date this bag was created. The bagging software should set this automatically.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": true + }, + { + "id": "4d9e682c-4236-4adf-aaf2-c9d7666e3062", + "tagFile": "bag-info.txt", + "tagName": "Bagging-Software", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The name of the software that created this bag. The bagging software should set this automatically.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": true + }, + { + "id": "32e69005-4495-452f-8b3d-bef545fca583", + "tagFile": "bag-info.txt", + "tagName": "Bag-Group-Identifier", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "Identifies the logical group or collection to which a bag belongs. Several bags may share the same Bag-Group-Identifier to indicate that they are part of the same logical grouping.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": true + }, + { + "id": "917fc560-5bd1-4a5b-acb6-b7a4ce749252", + "tagFile": "bag-info.txt", + "tagName": "Internal-Sender-Description", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "A description of the bag's contents for the sender's internal use. 
This description will appear in the APTrust registry if you do not set the Description tag in the aptrust-info.txt file.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": true + }, + { + "id": "018c0706-5597-4406-a705-205c608d827f", + "tagFile": "bag-info.txt", + "tagName": "Internal-Sender-Identifier", + "required": true, + "values": [], + "defaultValue": "", + "userValue": "", + "help": "A unique identifier for this bag inside your organization. For ReDATA, it is the DOI.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false + }, + { + "id": "de2c8f3e-fadb-4811-88a2-83aafa44fb50", + "tagFile": "bag-info.txt", + "tagName": "Payload-Oxum", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The number of files and bytes in this bag's payload. This should be calculated and set by the bagging software.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": true + }, + { + "id": "9b7344ae-9d06-4444-9d8a-dda7e5c2b8dc", + "tagFile": "aptrust-info.txt", + "tagName": "Title", + "required": true, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The title or name of that describes this bag's contents.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": false + }, + { + "id": "60ef466a-6d9c-4825-92cf-e472fb05f3d4", + "tagFile": "aptrust-info.txt", + "tagName": "Access", + "required": true, + "values": [ + "Consortia", + "Institution", + "Restricted" + ], + "defaultValue": "Institution", + "userValue": "Institution", + "help": "Access rights for this bag describe who can see that it exists in the repository.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false + }, + { + "id": "d94d1d47-49cb-4569-8d27-d9ebbf25c9b2", + 
"tagFile": "aptrust-info.txt", + "tagName": "Description", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The description of the bag that you want to appear in the APTrust registry.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "emptyOk": true + }, + { + "id": "53075007-e6cf-4a18-9b34-caa605ed593f", + "tagFile": "aptrust-info.txt", + "tagName": "Storage-Option", + "required": true, + "values": [ + "Standard", + "Glacier-OH", + "Glacier-OR", + "Glacier-VA", + "Glacier-Deep-OH", + "Glacier-Deep-OR", + "Glacier-Deep-VA", + "Wasabi-VA", + "Wasabi-OR" + ], + "defaultValue": "Glacier-Deep-OR", + "userValue": "", + "help": "How do you want this bag to be stored in APTrust? Standard = S3/Virginia + Glacier/Oregon. Glacier-OH = Glacier-only storage in Ohio. Glacier-OR = Glacier-only storage in Oregon. Glacier-VA = Glacier-only storage in Virginia. Standard storage includes regular 90-day fixity checks. Glacier-only storage is less expensive but excludes fixity checks. 
File in Glacier-only storage may take up to 24 hours longer to restore and excessive Glacier retrieval may incur additional fees.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false + }, + { + "id": "22f51ea6-de98-494e-a876-dbaeb004cc9f", + "tagFile": "bag-info.txt", + "tagName": "License-Name", + "required": false, + "values": [ + "CC0", + "CC BY 4.0", + "CC BY-NC 4.0", + "CC BY-NC-SA 4.0", + "MIT", + "BSD 3-Clause", + "Apache 2.0", + "GPL 3.0+", + "LGPL-3.0", + "In Copyright" + ], + "defaultValue": "", + "userValue": "", + "help": "The name of the license assigned to the record in ReDATA.", + "isBuiltIn": false, + "isUserAddedFile": false, + "isUserAddedTag": true, + "wasAddedForJob": false + }, + { + "id": "317ede75-de6f-4bd5-9af0-67c2a24f9174", + "tagFile": "bag-info.txt", + "tagName": "External-Identifier", + "required": true, + "values": [], + "defaultValue": "", + "userValue": "", + "help": "Format: {id of item on figshare}-{version}-{first author firstname}-{first author lastname}-{hash}", + "isBuiltIn": false, + "isUserAddedFile": false, + "isUserAddedTag": true, + "wasAddedForJob": false + }, + { + "id": "8a264c2d-9a75-4a07-917e-1763e2b57993", + "tagFile": "bag-info.txt", + "tagName": "Published-Date", + "required": false, + "values": [], + "defaultValue": "", + "userValue": "", + "help": "The date the item was published in ReDATA (Posted Date). Use YYYY-MM-DD format.", + "isBuiltIn": false, + "isUserAddedFile": false, + "isUserAddedTag": true, + "wasAddedForJob": false + } + ], + "serialization": "required", + "baseProfileId": "043f1c22-c9ff-4112-86f8-8f8f1e6a2dca", + "isBuiltIn": false, + "tarDirMustMatchName": true + } + ], + "questions": [], + "remoteRepositories": [], + "storageServices": [ + { + "id": "b8bb7486-247b-4182-930a-f61644f5b95e", + "name": "Wasabi (ReDATA)", + "description": "Wasabi storage for ReDATA. 
Override necessary settings using ReBACH bagger config.", + "protocol": "s3", + "host": "***Override***", + "port": 0, + "bucket": "***Override***", + "login": "env:WASABI_ACCESS_KEY_ID", + "password": "env:WASABI_SECRET_ACCESS_KEY", + "loginExtra": "", + "allowsUpload": true, + "allowsDownload": true + } + ] +} \ No newline at end of file diff --git a/profiles/README.md b/profiles/README.md deleted file mode 100644 index 4f56bba..0000000 --- a/profiles/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# BagIt Profile -This profile is used by ReBACH to generate bags. The profile is provided in two slightly different formats: - -- BagIt Profile Specification format v1.3.0, `redata-bagit-vxx.json`. This is the standards-compliant version. -- DART format. `redata-bagit-dart-vxx.json`. This is the version actually used by ReBACH (directly embedded in the workflow files in the config directory). - -The DART-formatted version contains additional information that isn't supported by the BagIt Profile Specification such as help text for the DART GUI. Additionally, from the DART documentation: "note that this format [BagIt Profile Specification] cannot describe information about required tags outside of the bag-info.txt file." 
\ No newline at end of file diff --git a/profiles/redata-bagit-dart-v2.2.json b/profiles/redata-bagit-dart-v2.2.json deleted file mode 100644 index 6d27551..0000000 --- a/profiles/redata-bagit-dart-v2.2.json +++ /dev/null @@ -1,330 +0,0 @@ -{ - "80dda49d-96c9-46dd-91bf-7f57053854d4": { - "id": "80dda49d-96c9-46dd-91bf-7f57053854d4", - "userCanDelete": true, - "required": [ - "name", - "id" - ], - "errors": {}, - "name": "ReDATA BagIt Profile", - "description": "BagIt Profile for ReDATA BagIt preservation using ReBACH (based on APTrust BagIt Profile v2.2)", - "acceptBagItVersion": [ - "0.97", - "1.0" - ], - "acceptSerialization": [ - "application/tar" - ], - "allowFetchTxt": false, - "bagItProfileInfo": { - "bagItProfileIdentifier": "https://raw.githubusercontent.com/UAL-RE/ReBACH/main/profiles/redata-bagit-dart-v2.2.json", - "bagItProfileVersion": "", - "contactEmail": "redata@arizona.edu", - "contactName": "ReDATA Administrator", - "externalDescription": "BagIt profile for creating bags from ReDATA content. 
Based on APTrust BagIt profile v2.2.", - "sourceOrganization": "redata.arizona.edu", - "version": "2.2" - }, - "manifestsRequired": [ - "md5" - ], - "manifestsAllowed": [ - "md5", - "sha256" - ], - "tagManifestsRequired": [], - "tagManifestsAllowed": [ - "md5", - "sha256" - ], - "tagFilesAllowed": [ - "*" - ], - "tags": [ - { - "id": "39b8ac8a-8e3d-47c3-9cda-5edd0d4ad1fb", - "tagFile": "bagit.txt", - "tagName": "BagIt-Version", - "required": true, - "values": [ - "0.97", - "1.0" - ], - "defaultValue": "0.97", - "userValue": "", - "help": "Which version of the BagIt specification describes this bag's format?", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": false - }, - { - "id": "2a914ea2-ee3b-4c53-96e1-4f93f641338b", - "tagFile": "bagit.txt", - "tagName": "Tag-File-Character-Encoding", - "required": true, - "values": [ - "UTF-8" - ], - "defaultValue": "UTF-8", - "userValue": "", - "help": "How are this bag's plain-text tag files encoded? (Hint: usually UTF-8)", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": false - }, - { - "id": "567451b6-1f30-4bda-b66b-9a657426d5e5", - "tagFile": "bag-info.txt", - "tagName": "Source-Organization", - "required": true, - "values": [], - "defaultValue": "University of Arizona Libraries", - "userValue": "", - "help": "The name of the organization that produced this bag, or is responsible for its contents.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {} - }, - { - "id": "117e46d8-096f-41f1-8c94-7d9202b9477b", - "tagFile": "bag-info.txt", - "tagName": "Bag-Count", - "required": false, - "values": [], - "defaultValue": "", - "userValue": "", - "help": "The number of bags that make up this object. Set this only if you are packaging a single object into multiple bags. 
See https://wiki.aptrust.org/Bagging_specifications for info on naming multi-part APTrust bags.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {} - }, - { - "id": "41b75504-e54d-49a1-aad4-c8a4921d15ce", - "tagFile": "bag-info.txt", - "tagName": "Bagging-Date", - "required": false, - "values": [], - "defaultValue": null, - "userValue": "", - "help": "The date this bag was created. The bagging software should set this automatically.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": true - }, - { - "id": "4d9e682c-4236-4adf-aaf2-c9d7666e3062", - "tagFile": "bag-info.txt", - "tagName": "Bagging-Software", - "required": false, - "values": [], - "defaultValue": null, - "userValue": "", - "help": "The name of the software that created this bag. The bagging software should set this automatically.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": true - }, - { - "id": "32e69005-4495-452f-8b3d-bef545fca583", - "tagFile": "bag-info.txt", - "tagName": "Bag-Group-Identifier", - "required": false, - "values": [], - "defaultValue": null, - "userValue": "", - "help": "Identifies the logical group or collection to which a bag belongs. Several bags may share the same Bag-Group-Identifier to indicate that they are part of the same logical grouping.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": true - }, - { - "id": "917fc560-5bd1-4a5b-acb6-b7a4ce749252", - "tagFile": "bag-info.txt", - "tagName": "Internal-Sender-Description", - "required": false, - "values": [], - "defaultValue": null, - "userValue": "", - "help": "A description of the bag's contents for the sender's internal use. 
This description will appear in the APTrust registry if you do not set the Description tag in the aptrust-info.txt file.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": true - }, - { - "id": "018c0706-5597-4406-a705-205c608d827f", - "tagFile": "bag-info.txt", - "tagName": "Internal-Sender-Identifier", - "required": true, - "values": [], - "defaultValue": "", - "userValue": "", - "help": "A unique identifier for this bag inside your organization. For ReDATA, it is the DOI.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {} - }, - { - "id": "de2c8f3e-fadb-4811-88a2-83aafa44fb50", - "tagFile": "bag-info.txt", - "tagName": "Payload-Oxum", - "required": false, - "values": [], - "defaultValue": null, - "userValue": "", - "help": "The number of files and bytes in this bag's payload. This should be calculated and set by the bagging software.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": true - }, - { - "id": "9b7344ae-9d06-4444-9d8a-dda7e5c2b8dc", - "tagFile": "aptrust-info.txt", - "tagName": "Title", - "required": true, - "values": [], - "defaultValue": null, - "userValue": "", - "help": "The title or name of that describes this bag's contents.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": false - }, - { - "id": "60ef466a-6d9c-4825-92cf-e472fb05f3d4", - "tagFile": "aptrust-info.txt", - "tagName": "Access", - "required": true, - "values": [ - "Consortia", - "Institution", - "Restricted" - ], - "defaultValue": "Institution", - "userValue": "Institution", - "help": "Access rights for this bag describe who can see that it exists in the repository.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, 
- "errors": {} - }, - { - "id": "d94d1d47-49cb-4569-8d27-d9ebbf25c9b2", - "tagFile": "aptrust-info.txt", - "tagName": "Description", - "required": false, - "values": [], - "defaultValue": null, - "userValue": "", - "help": "The description of the bag that you want to appear in the APTrust registry.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {}, - "emptyOk": true - }, - { - "id": "53075007-e6cf-4a18-9b34-caa605ed593f", - "tagFile": "aptrust-info.txt", - "tagName": "Storage-Option", - "required": true, - "values": [ - "Standard", - "Glacier-OH", - "Glacier-OR", - "Glacier-VA", - "Glacier-Deep-OH", - "Glacier-Deep-OR", - "Glacier-Deep-VA", - "Wasabi-VA", - "Wasabi-OR" - ], - "defaultValue": "Glacier-Deep-OR", - "userValue": "", - "help": "How do you want this bag to be stored in APTrust? Standard = S3/Virginia + Glacier/Oregon. Glacier-OH = Glacier-only storage in Ohio. Glacier-OR = Glacier-only storage in Oregon. Glacier-VA = Glacier-only storage in Virginia. Standard storage includes regular 90-day fixity checks. Glacier-only storage is less expensive but excludes fixity checks. File in Glacier-only storage may take up to 24 hours longer to restore and excessive Glacier retrieval may incur additional fees.", - "isBuiltIn": true, - "isUserAddedFile": false, - "isUserAddedTag": false, - "wasAddedForJob": false, - "errors": {} - }, - { - "id": "16772027-e33b-416b-9eb1-9d50b02224f7", - "tagFile": "bag-info.txt", - "tagName": "License-Name", - "required": false, - "values": [ - "CC0", - "CC BY 4.0", - "CC BY-NC 4.0", - "CC BY-NC-SA 4.0", - "MIT", - "BSD 3-Clause", - "Apache 2.0", - "GPL 3.0+", - "LGPL-3.0", - "In Copyright" - ], - "defaultValue": "", - "userValue": "", - "help": "The name of the license assigned to the record in ReDATA. 
For items with multiple licenses,", - "isBuiltIn": false, - "isUserAddedFile": false, - "isUserAddedTag": true, - "wasAddedForJob": false, - "errors": {} - }, - { - "id": "d9044d6b-818c-4c3b-b7ff-ab867c525249", - "tagFile": "bag-info.txt", - "tagName": "Published-Date", - "required": false, - "values": [], - "defaultValue": "", - "userValue": "", - "help": "The date the item was published in ReDATA (Posted Date). Use YYYY-MM-DD format.", - "isBuiltIn": false, - "isUserAddedFile": false, - "isUserAddedTag": true, - "wasAddedForJob": false, - "errors": {} - } - ], - "serialization": "required", - "baseProfileId": "043f1c22-c9ff-4112-86f8-8f8f1e6a2dca", - "isBuiltIn": false, - "tarDirMustMatchName": true - } -} \ No newline at end of file diff --git a/profiles/redata-bagit-v2.2.json b/profiles/redata-bagit-v2.2.json deleted file mode 100644 index b85fb6f..0000000 --- a/profiles/redata-bagit-v2.2.json +++ /dev/null @@ -1,82 +0,0 @@ -{ - "Accept-BagIt-Version": [ - "0.97", - "1.0" - ], - "Accept-Serialization": [ - "application/tar" - ], - "Allow-Fetch.txt": false, - "Serialization": "required", - "Manifests-Allowed": [ - "md5", - "sha256" - ], - "Tag-Manifests-Allowed": [ - "md5", - "sha256" - ], - "Manifests-Required": [ - "md5" - ], - "Tag-Manifests-Required": [], - "Tag-Files-Allowed": [ - "*" - ], - "BagIt-Profile-Info": { - "BagIt-Profile-Identifier": "https://raw.githubusercontent.com/UAL-RE/ReBACH/main/profiles/redata-bagit-v2.2.json", - "BagIt-Profile-Version": "", - "Contact-Email": "redata@arizona.edu", - "Contact-Name": "ReDATA Administrator", - "External-Description": "BagIt profile for creating bags from ReDATA content. 
Based on APTrust BagIt profile v2.2.", - "Source-Organization": "redata.arizona.edu", - "Version": "2.2" - }, - "Bag-Info": { - "Source-Organization": { - "required": true - }, - "Bag-Count": { - "required": false - }, - "Bagging-Date": { - "required": false - }, - "Bagging-Software": { - "required": false - }, - "Bag-Group-Identifier": { - "required": false - }, - "Internal-Sender-Description": { - "required": false - }, - "Internal-Sender-Identifier": { - "required": true - }, - "Payload-Oxum": { - "required": false - }, - "License-Name": { - "required": false, - "values": [ - "CC0", - "CC BY 4.0", - "CC BY-NC 4.0", - "CC BY-NC-SA 4.0", - "MIT", - "BSD 3-Clause", - "Apache 2.0", - "GPL 3.0+", - "LGPL-3.0", - "In Copyright" - ] - }, - "Published-Date": { - "required": false - } - }, - "Tag-Files-Required": [ - "aptrust-info.txt" - ] -} \ No newline at end of file From 5dbf5600e7041e35ec659ffea0445540862379c0 Mon Sep 17 00:00:00 2001 From: Fernando Rios Date: Thu, 3 Jul 2025 12:19:37 -0400 Subject: [PATCH 2/7] Update dart runner workflow with new profile --- bagger/config/default_workflow.json | 40 ++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/bagger/config/default_workflow.json b/bagger/config/default_workflow.json index 21883b3..89abfa3 100644 --- a/bagger/config/default_workflow.json +++ b/bagger/config/default_workflow.json @@ -6,14 +6,14 @@ "packagePluginId": "BagIt", "packagePluginName": null, "bagItProfile": { - "id": "80dda49d-96c9-46dd-91bf-7f57053854d4", + "id": "7f0cd963-4750-4c10-a3de-0c65bb0d0dc5", "userCanDelete": true, "required": [ "name", "id" ], "errors": {}, - "name": "ReDATA BagIt Profile", + "name": "ReDATA", "description": "BagIt Profile for ReDATA BagIt preservation using ReBACH (based on APTrust BagIt Profile v2.2)", "acceptBagItVersion": [ "0.97", @@ -24,13 +24,13 @@ ], "allowFetchTxt": false, "bagItProfileInfo": { - "bagItProfileIdentifier": 
"https://raw.githubusercontent.com/UAL-RE/ReBACH/main/profiles/redata-bagit-dart-v2.2.json", + "bagItProfileIdentifier": "https://raw.githubusercontent.com/UAL-RE/ReBACH/main/bagger/profiles/redata-bagit-dart-v3.0.json", "bagItProfileVersion": "", "contactEmail": "redata@arizona.edu", "contactName": "ReDATA Administrator", - "externalDescription": "BagIt profile for creating bags from ReDATA content. Based on APTrust BagIt profile v2.2.", + "externalDescription": "Profile for ReDATA content. Based on APTrust BagIt profile v2.2. bagItProfileIdentifier points to a DART settings object which contains this profile.", "sourceOrganization": "redata.arizona.edu", - "version": "2.2" + "version": "3.0" }, "manifestsRequired": [ "md5" @@ -106,14 +106,15 @@ "tagName": "Bag-Count", "required": false, "values": [], - "defaultValue": "", + "defaultValue": null, "userValue": "", "help": "The number of bags that make up this object. Set this only if you are packaging a single object into multiple bags. See https://wiki.aptrust.org/Bagging_specifications for info on naming multi-part APTrust bags.", "isBuiltIn": true, "isUserAddedFile": false, "isUserAddedTag": false, "wasAddedForJob": false, - "errors": {} + "errors": {}, + "emptyOk": true }, { "id": "41b75504-e54d-49a1-aad4-c8a4921d15ce", @@ -287,7 +288,7 @@ "errors": {} }, { - "id": "16772027-e33b-416b-9eb1-9d50b02224f7", + "id": "22f51ea6-de98-494e-a876-dbaeb004cc9f", "tagFile": "bag-info.txt", "tagName": "License-Name", "required": false, @@ -305,7 +306,22 @@ ], "defaultValue": "", "userValue": "", - "help": "The name of the license assigned to the record in ReDATA. 
For items with multiple licenses,", + "help": "The name of the license assigned to the record in ReDATA.", + "isBuiltIn": false, + "isUserAddedFile": false, + "isUserAddedTag": true, + "wasAddedForJob": false, + "errors": {} + }, + { + "id": "317ede75-de6f-4bd5-9af0-67c2a24f9174", + "tagFile": "bag-info.txt", + "tagName": "External-Identifier", + "required": true, + "values": [], + "defaultValue": "", + "userValue": "", + "help": "Format: {id of item on figshare}-{version}-{first author firstname}-{first author lastname}-{hash}", "isBuiltIn": false, "isUserAddedFile": false, "isUserAddedTag": true, @@ -313,7 +329,7 @@ "errors": {} }, { - "id": "d9044d6b-818c-4c3b-b7ff-ab867c525249", + "id": "8a264c2d-9a75-4a07-917e-1763e2b57993", "tagFile": "bag-info.txt", "tagName": "Published-Date", "required": false, @@ -344,8 +360,8 @@ "id" ], "errors": {}, - "name": "Wasabi Main", - "description": "Main Wasabi storage endpoint", + "name": "Wasabi (ReDATA)", + "description": "Wasabi storage for ReDATA. 
Override necessary settings using ReBACH bagger config.", "protocol": "s3", "host": "***Override***", "port": 0, From 3c6b91fa661cc5e6fba21c78f258ccb5203c89a0 Mon Sep 17 00:00:00 2001 From: zoidy Date: Sat, 5 Jul 2025 19:47:33 +0000 Subject: [PATCH 3/7] Add ability to specify multiple values to extract from json metadata for inclusion in bag tag files --- bagger/bag.py | 2 +- bagger/config/default.example.toml | 7 +- bagger/config/noupload_workflow.json | 354 +++++++++++++++++++++++++++ bagger/metadata.py | 50 ++-- 4 files changed, 393 insertions(+), 20 deletions(-) create mode 100644 bagger/config/noupload_workflow.json diff --git a/bagger/bag.py b/bagger/bag.py index 195e4dd..8d88d95 100644 --- a/bagger/bag.py +++ b/bagger/bag.py @@ -125,7 +125,7 @@ def _init_dart(self, package_path: PathLike) -> Union[Status, tuple[str, list]]: if not self.validate_package(metadata_path): return Status.INVALID_PACKAGE - metadata_tags = Metadata(self.config, metadata_path, + metadata_tags = Metadata(self.config, metadata_path, article_id, version, metadata_hash, self.log).parse_metadata() if not metadata_tags: diff --git a/bagger/config/default.example.toml b/bagger/config/default.example.toml index 0e83947..d570937 100644 --- a/bagger/config/default.example.toml +++ b/bagger/config/default.example.toml @@ -26,6 +26,7 @@ aptrust-info.Description = { tag_path = "description", strip_html = true } bag-info.Contact-Name = "ReDATA Administrator" bag-info.Contact-Email = "redata@arizona.edu" -bag-info.Internal-Sender-Identifier = { tag_path = "doi" } -bag-info.License-Name = { tag_path = "license.name" } -bag-info.Published-Date = { tag_path = "published_date" } +bag-info.Internal-Sender-Identifier = { tag_path = ["doi"] } +bag-info.License-Name = { tag_path = ["license.name"] } +bag-info.Published-Date = { tag_path = ["published_date"] } +bag-info.External-Identifier = { tag_path = ["id", "#version#", "authors.0.first_name", "authors.0.last_name", "#hash#"] } diff --git 
a/bagger/config/noupload_workflow.json b/bagger/config/noupload_workflow.json new file mode 100644 index 0000000..c4d3c6b --- /dev/null +++ b/bagger/config/noupload_workflow.json @@ -0,0 +1,354 @@ +{ + "id": "540d3238-ec1c-4d9d-a314-23e190930e88", + "name": "ReBACH", + "description": "ReBACH DART Workflow", + "packageFormat": "BagIt", + "packagePluginId": "BagIt", + "packagePluginName": null, + "bagItProfile": { + "id": "7f0cd963-4750-4c10-a3de-0c65bb0d0dc5", + "userCanDelete": true, + "required": [ + "name", + "id" + ], + "errors": {}, + "name": "ReDATA", + "description": "BagIt Profile for ReDATA BagIt preservation using ReBACH (based on APTrust BagIt Profile v2.2)", + "acceptBagItVersion": [ + "0.97", + "1.0" + ], + "acceptSerialization": [ + "application/tar" + ], + "allowFetchTxt": false, + "bagItProfileInfo": { + "bagItProfileIdentifier": "https://raw.githubusercontent.com/UAL-RE/ReBACH/main/bagger/profiles/redata-bagit-dart-v3.0.json", + "bagItProfileVersion": "", + "contactEmail": "redata@arizona.edu", + "contactName": "ReDATA Administrator", + "externalDescription": "Profile for ReDATA content. Based on APTrust BagIt profile v2.2. 
bagItProfileIdentifier points to a DART settings object which contains this profile.", + "sourceOrganization": "redata.arizona.edu", + "version": "3.0" + }, + "manifestsRequired": [ + "md5" + ], + "manifestsAllowed": [ + "md5", + "sha256" + ], + "tagManifestsRequired": [], + "tagManifestsAllowed": [ + "md5", + "sha256" + ], + "tagFilesAllowed": [ + "*" + ], + "tags": [ + { + "id": "39b8ac8a-8e3d-47c3-9cda-5edd0d4ad1fb", + "tagFile": "bagit.txt", + "tagName": "BagIt-Version", + "required": true, + "values": [ + "0.97", + "1.0" + ], + "defaultValue": "0.97", + "userValue": "", + "help": "Which version of the BagIt specification describes this bag's format?", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": false + }, + { + "id": "2a914ea2-ee3b-4c53-96e1-4f93f641338b", + "tagFile": "bagit.txt", + "tagName": "Tag-File-Character-Encoding", + "required": true, + "values": [ + "UTF-8" + ], + "defaultValue": "UTF-8", + "userValue": "", + "help": "How are this bag's plain-text tag files encoded? (Hint: usually UTF-8)", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": false + }, + { + "id": "567451b6-1f30-4bda-b66b-9a657426d5e5", + "tagFile": "bag-info.txt", + "tagName": "Source-Organization", + "required": true, + "values": [], + "defaultValue": "University of Arizona Libraries", + "userValue": "", + "help": "The name of the organization that produced this bag, or is responsible for its contents.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {} + }, + { + "id": "117e46d8-096f-41f1-8c94-7d9202b9477b", + "tagFile": "bag-info.txt", + "tagName": "Bag-Count", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The number of bags that make up this object. 
Set this only if you are packaging a single object into multiple bags. See https://wiki.aptrust.org/Bagging_specifications for info on naming multi-part APTrust bags.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": true + }, + { + "id": "41b75504-e54d-49a1-aad4-c8a4921d15ce", + "tagFile": "bag-info.txt", + "tagName": "Bagging-Date", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The date this bag was created. The bagging software should set this automatically.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": true + }, + { + "id": "4d9e682c-4236-4adf-aaf2-c9d7666e3062", + "tagFile": "bag-info.txt", + "tagName": "Bagging-Software", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The name of the software that created this bag. The bagging software should set this automatically.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": true + }, + { + "id": "32e69005-4495-452f-8b3d-bef545fca583", + "tagFile": "bag-info.txt", + "tagName": "Bag-Group-Identifier", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "Identifies the logical group or collection to which a bag belongs. 
Several bags may share the same Bag-Group-Identifier to indicate that they are part of the same logical grouping.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": true + }, + { + "id": "917fc560-5bd1-4a5b-acb6-b7a4ce749252", + "tagFile": "bag-info.txt", + "tagName": "Internal-Sender-Description", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "A description of the bag's contents for the sender's internal use. This description will appear in the APTrust registry if you do not set the Description tag in the aptrust-info.txt file.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": true + }, + { + "id": "018c0706-5597-4406-a705-205c608d827f", + "tagFile": "bag-info.txt", + "tagName": "Internal-Sender-Identifier", + "required": true, + "values": [], + "defaultValue": "", + "userValue": "", + "help": "A unique identifier for this bag inside your organization. For ReDATA, it is the DOI.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {} + }, + { + "id": "de2c8f3e-fadb-4811-88a2-83aafa44fb50", + "tagFile": "bag-info.txt", + "tagName": "Payload-Oxum", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The number of files and bytes in this bag's payload. 
This should be calculated and set by the bagging software.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": true + }, + { + "id": "9b7344ae-9d06-4444-9d8a-dda7e5c2b8dc", + "tagFile": "aptrust-info.txt", + "tagName": "Title", + "required": true, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The title or name of that describes this bag's contents.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": false + }, + { + "id": "60ef466a-6d9c-4825-92cf-e472fb05f3d4", + "tagFile": "aptrust-info.txt", + "tagName": "Access", + "required": true, + "values": [ + "Consortia", + "Institution", + "Restricted" + ], + "defaultValue": "Institution", + "userValue": "Institution", + "help": "Access rights for this bag describe who can see that it exists in the repository.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {} + }, + { + "id": "d94d1d47-49cb-4569-8d27-d9ebbf25c9b2", + "tagFile": "aptrust-info.txt", + "tagName": "Description", + "required": false, + "values": [], + "defaultValue": null, + "userValue": "", + "help": "The description of the bag that you want to appear in the APTrust registry.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {}, + "emptyOk": true + }, + { + "id": "53075007-e6cf-4a18-9b34-caa605ed593f", + "tagFile": "aptrust-info.txt", + "tagName": "Storage-Option", + "required": true, + "values": [ + "Standard", + "Glacier-OH", + "Glacier-OR", + "Glacier-VA", + "Glacier-Deep-OH", + "Glacier-Deep-OR", + "Glacier-Deep-VA", + "Wasabi-VA", + "Wasabi-OR" + ], + "defaultValue": "Glacier-Deep-OR", + "userValue": "", + "help": "How do you want this bag to be stored in APTrust? Standard = S3/Virginia + Glacier/Oregon. 
Glacier-OH = Glacier-only storage in Ohio. Glacier-OR = Glacier-only storage in Oregon. Glacier-VA = Glacier-only storage in Virginia. Standard storage includes regular 90-day fixity checks. Glacier-only storage is less expensive but excludes fixity checks. File in Glacier-only storage may take up to 24 hours longer to restore and excessive Glacier retrieval may incur additional fees.", + "isBuiltIn": true, + "isUserAddedFile": false, + "isUserAddedTag": false, + "wasAddedForJob": false, + "errors": {} + }, + { + "id": "22f51ea6-de98-494e-a876-dbaeb004cc9f", + "tagFile": "bag-info.txt", + "tagName": "License-Name", + "required": false, + "values": [ + "CC0", + "CC BY 4.0", + "CC BY-NC 4.0", + "CC BY-NC-SA 4.0", + "MIT", + "BSD 3-Clause", + "Apache 2.0", + "GPL 3.0+", + "LGPL-3.0", + "In Copyright" + ], + "defaultValue": "", + "userValue": "", + "help": "The name of the license assigned to the record in ReDATA.", + "isBuiltIn": false, + "isUserAddedFile": false, + "isUserAddedTag": true, + "wasAddedForJob": false, + "errors": {} + }, + { + "id": "317ede75-de6f-4bd5-9af0-67c2a24f9174", + "tagFile": "bag-info.txt", + "tagName": "External-Identifier", + "required": true, + "values": [], + "defaultValue": "", + "userValue": "", + "help": "Format: {id of item on figshare}-{version}-{first author firstname}-{first author lastname}-{hash}", + "isBuiltIn": false, + "isUserAddedFile": false, + "isUserAddedTag": true, + "wasAddedForJob": false, + "errors": {} + }, + { + "id": "8a264c2d-9a75-4a07-917e-1763e2b57993", + "tagFile": "bag-info.txt", + "tagName": "Published-Date", + "required": false, + "values": [], + "defaultValue": "", + "userValue": "", + "help": "The date the item was published in ReDATA (Posted Date). 
Use YYYY-MM-DD format.", + "isBuiltIn": false, + "isUserAddedFile": false, + "isUserAddedTag": true, + "wasAddedForJob": false, + "errors": {} + } + ], + "serialization": "required", + "baseProfileId": "043f1c22-c9ff-4112-86f8-8f8f1e6a2dca", + "isBuiltIn": false, + "tarDirMustMatchName": true + }, + "storageServices": [ + ] +} \ No newline at end of file diff --git a/bagger/metadata.py b/bagger/metadata.py index 4c8da2a..3337bfa 100644 --- a/bagger/metadata.py +++ b/bagger/metadata.py @@ -12,18 +12,24 @@ class Metadata: - def __init__(self, config: dict, metadata_json_path: Path, log: Logger): + def __init__(self, config: dict, metadata_json_path: Path, article_id: str, version: str, ver_hash: str, log: Logger): """ Assemble metadata tags to embed in bags :param config: Config dict :param metadata_json_path: Path to package metadata JSON file + :param article_id: id of the article being processed + :param version: version of the article_id being processed + :param ver_hash: identifying hash of the item being processed :param log: Logger object """ self.config: dict = config self.log: Logger = log self.metadata_json_path: Path = metadata_json_path self.metadata_config: dict = self.config['Metadata'] + self.article_id: str = article_id + self.version: str = version + self.hash: str = ver_hash self.tags: list[Tag] = [] @@ -52,7 +58,7 @@ def parse_metadata(self) -> Union[list[Tag], Literal[False]]: # Otherwise, the tag_value needs to be extracted from the metadata, and we need to # check for strip_html and shorten. 
try: - tag_path = tag_annotation['tag_path'] + tag_path_list = tag_annotation['tag_path'] except KeyError: self.log.error(f"Metadata key '{_tag_file}.{tag_name}' " f"must have tag_path defined") @@ -60,20 +66,32 @@ def parse_metadata(self) -> Union[list[Tag], Literal[False]]: strip_html = tag_annotation.get('strip_html', False) shorten = tag_annotation.get('shorten', False) - split_tag_path = tag_path.split('.') - - tag_value = self._descend_json(self.data, split_tag_path, tag_path) - - if not tag_value: - return False - - if strip_html: - tag_value = strip_tags(tag_value) - - if shorten: - tag_value = textwrap.shorten(tag_value, shorten) - - self.tags.append(Tag(tag_file, tag_name, tag_value)) + tag_value_sep = '-' + tag_value_list =[] + for tag_path in tag_path_list: + if tag_path.startswith('#') and tag_path.endswith('#'): + # Special case where we want to use article_id, version, or hash in tag files + try: + tag_value_list.append(getattr(self, tag_path.replace('#',''))) + except AttributeError: + print(f"Error: Variable '{tag_path.replace('#','')}' does not exist in class Metadata.") + else: + split_tag_path = tag_path.split('.') + + tag_value = self._descend_json(self.data, split_tag_path, tag_path) + + if not tag_value: + tag_value_list.append("") + else: + if strip_html: + tag_value = strip_tags(tag_value) + + if shorten: + tag_value = textwrap.shorten(tag_value, shorten) + + tag_value_list.append(str(tag_value)) + + self.tags.append(Tag(tag_file, tag_name, tag_value_sep.join(tag_value_list))) return self.tags From ebe0446a3f1e7cd281dd62354acb91e1e03720bd Mon Sep 17 00:00:00 2001 From: zoidy Date: Sat, 5 Jul 2025 20:00:20 +0000 Subject: [PATCH 4/7] Update bagger readme --- bagger/README.md | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/bagger/README.md b/bagger/README.md index 7f00729..45ea19c 100644 --- a/bagger/README.md +++ b/bagger/README.md @@ -96,7 +96,7 @@ class Status(IntEnum): DRY_RUN = SUCCESS 
``` -Code that imports the `Bagger` module can use the name or value of the `Status` object: +Code that imports the `Bagger` module can use the name or value of the `Status` object: ```python if status == Status.INVALID_PATH: @@ -152,12 +152,12 @@ logfile_prefix = "ReBACH-Bagger" # Log filename prefix ### Wasabi -Both DART and ReBACH-Bagger use the credentials in this section to authenticate to Wasabi. +Both DART and ReBACH-Bagger use the credentials in this section to authenticate to Wasabi. ReBACH-Bagger checks Wasabi for duplicate bags. See [DART Workflow]("#dart-workflow") for details on how -these variables are used in DART. +these variables are used in DART. -If the `dart_workflow_hostbucket_override` variable is set to `true` -(default), the values of `host` and `bucket` defined here are used in the DART workflow defined in the +If the `dart_workflow_hostbucket_override` variable is set to `true` +(default), the values of `host` and `bucket` defined here are used in the DART workflow defined in the `workflow` variable above. If set to `false`, the values defined in the workflow itself are used instead. This option can only be set in the configuration file. @@ -192,7 +192,7 @@ the tag. ### Metadata from JSON -Users may also use an inline table to define a dot-notation `tag_path` to the tag's corresponding +Users may also use an inline table to define a dot-notation `tag_path` to the tag's corresponding key in the package's metadata file. Take the following abbreviated example of a metadata JSON file: ```json @@ -214,16 +214,29 @@ key in the package's metadata file. 
Take the following abbreviated example of a } ``` -To define a set of tags based on this metadata named "First-Author", "License", and "DOI" in the +To define a set of tags based on this metadata named "First-Author", "License", and "DOI" in the "bag-info.txt" tag file, users can define the following relationships in the config file: ```toml [Metadata] -bag-info.First-Author = { tag_path = "authors.0.full_name" } -bag-info.License = { tag_path = "license.name" } -bag-info.DOI = { tag_path = "doi" } +bag-info.First-Author = { tag_path = ["authors.0.full_name"] } +bag-info.License = { tag_path = ["license.name"] } +bag-info.DOI = { tag_path = ["doi"] } ``` +To extract multiple items and concatenate their values into a single tag, include more list items. + +```toml +bag-info.External-Identifier = { tag_path = ["authors.0.last_name", "#hash#"] } +``` + +Note the special value `#hash#`. This will not extract values from the JSON but instead from the name of the bag that will be created +- `#id#`: the article id +- `#version#`: the article version (in `vXX` format where XX is a zero-padded number from 1 to 99) +- `#hash#`: The metadata hash +In the example, the value of External-Identifier will be set to `Avants-<hash>` where `<hash>` is a 32 character MD5 hash. + + ### Metadata Utilities ReBACH-Bagger can strip HTML tags out of a metadata value. To enable this functionality, use a From d651ab7f5276dc0ac9c5f818e505601212a190ac Mon Sep 17 00:00:00 2001 From: zoidy Date: Mon, 7 Jul 2025 12:51:12 +0000 Subject: [PATCH 5/7] Add clarifications to bagger readme; lint --- bagger/README.md | 8 +++++--- bagger/metadata.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bagger/README.md b/bagger/README.md index 45ea19c..5affec1 100644 --- a/bagger/README.md +++ b/bagger/README.md @@ -227,14 +227,16 @@ bag-info.DOI = { tag_path = ["doi"] } To extract multiple items and concatenate their values into a single tag, include more list items. 
```toml -bag-info.External-Identifier = { tag_path = ["authors.0.last_name", "#hash#"] } +bag-info.External-Identifier = { tag_path = ["authors.0.full_name", "#hash#"] } ``` -Note the special value `#hash#`. This will not extract values from the JSON but instead from the name of the bag that will be created +Note the special value `#hash#`. This will not extract values from the JSON but instead from the name of the bag that will be created. Available values: + - `#id#`: the article id - `#version#`: the article version (in `vXX` format where XX is a zero-padded number from 1 to 99) - `#hash#`: The metadata hash -In the example, the value of External-Identifier will be set to `Avants-<hash>` where `<hash>` is a 32 character MD5 hash. + +In the example, the value of External-Identifier will be set to `Brian Avants-<hash>` where `<hash>` is the 32 character MD5 hash computed by bagger for the bag name. ### Metadata Utilities diff --git a/bagger/metadata.py b/bagger/metadata.py index 3337bfa..665030a 100644 --- a/bagger/metadata.py +++ b/bagger/metadata.py @@ -67,14 +67,14 @@ def parse_metadata(self) -> Union[list[Tag], Literal[False]]: strip_html = tag_annotation.get('strip_html', False) shorten = tag_annotation.get('shorten', False) tag_value_sep = '-' - tag_value_list =[] + tag_value_list = [] for tag_path in tag_path_list: if tag_path.startswith('#') and tag_path.endswith('#'): # Special case where we want to use article_id, version, or hash in tag files try: - tag_value_list.append(getattr(self, tag_path.replace('#',''))) + tag_value_list.append(getattr(self, tag_path.replace('#', ''))) except AttributeError: - print(f"Error: Variable '{tag_path.replace('#','')}' does not exist in class Metadata.") + print(f"Error: Variable '{tag_path.replace('#', '')}' does not exist in class Metadata.") else: split_tag_path = tag_path.split('.') From c0ff3e20295bf2f56447227729da304df285154c Mon Sep 17 00:00:00 2001 From: Fernando Rios Date: Mon, 7 Jul 2025 14:59:48 -0400 Subject: [PATCH 6/7] Update 
bagger config to use the noupload workflow The workflow instructs dart-runner to not upload bags to Wasabi --- bagger/README.md | 2 ++ bagger/config/default.example.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bagger/README.md b/bagger/README.md index 5affec1..6653312 100644 --- a/bagger/README.md +++ b/bagger/README.md @@ -140,6 +140,8 @@ workflow = "default_workflow.json" # Path to the DART workflow file dart_command = "dart-runner" # Command or path to DART executable ``` +There are currently two workflows: `default_workflow.json` and `noupload_workflow.json`. The only difference is that the latter instructs dart-runner not to upload files to any remote storage locations. + ### Logging ReBACH-Bagger logs errors, debug messages, and DART output to disk. diff --git a/bagger/config/default.example.toml b/bagger/config/default.example.toml index d570937..d393ee2 100644 --- a/bagger/config/default.example.toml +++ b/bagger/config/default.example.toml @@ -1,6 +1,6 @@ [Defaults] output_dir = "out" -workflow = "bagger/config/default_workflow.json" +workflow = "bagger/config/noupload_workflow.json" dart_command = "dart-runner" overwrite = false delete = true From ae51964186edd5d16955ac3b6f6328ee36876cc0 Mon Sep 17 00:00:00 2001 From: Fernando Rios Date: Mon, 7 Jul 2025 15:29:25 -0400 Subject: [PATCH 7/7] Add ability to include string literals in bag tags --- bagger/README.md | 7 ++++++- bagger/metadata.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/bagger/README.md b/bagger/README.md index 6653312..1f79f31 100644 --- a/bagger/README.md +++ b/bagger/README.md @@ -238,7 +238,12 @@ Note the special value `#hash#`. 
This will not extract values from the JSON but - `#version#`: the article version (in `vXX` format where XX is a zero-padded number from 1 to 99) - `#hash#`: The metadata hash -In the example, the value of External-Identifier will be set to `Brian Avants-<hash>` where `<hash>` is the 32 character MD5 hash computed by bagger for the bag name. +In the example, the value of External-Identifier will be set to `Brian Avants-<hash>` where `<hash>` is the 32 character MD5 hash computed by bagger for the bag name. To include literal text in the tag, enclose it in `@`. + +```toml +bag-info.External-Identifier = { tag_path = ["@azu_@", "authors.0.full_name", "#hash#"] } +``` +sets External-Identifier to `azu_Brian Avants-<hash>` ### Metadata Utilities diff --git a/bagger/metadata.py b/bagger/metadata.py index 665030a..f669985 100644 --- a/bagger/metadata.py +++ b/bagger/metadata.py @@ -75,6 +75,8 @@ def parse_metadata(self) -> Union[list[Tag], Literal[False]]: tag_value_list.append(getattr(self, tag_path.replace('#', ''))) except AttributeError: print(f"Error: Variable '{tag_path.replace('#', '')}' does not exist in class Metadata.") + elif tag_path.startswith('@') and tag_path.endswith('@'): + tag_value_list.append(tag_path.replace('@', '')) else: split_tag_path = tag_path.split('.')