Skip to content

dataherb.flora

dataherb.flora¤

Flora ¤

A container of datasets. It loads a local folder of dataset metadata and forms a list of dataset objects.

The provided local path or remote resource will then be converted to a list of dataherb objects.

Parameters:

Name Type Description Default
flora

Path to the flora database: either a URL or a local path.

required
is_aggregated bool

if True, the flora is aggregated into one json file.

False
Source code in dataherb/flora.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
class Flora:
    """
    A container of datasets. It loads a local folder of dataset metadata and
    forms a list of dataset objects.

    The provided local path or remote resource will then be converted to a list
    of dataherb objects.

    :param flora_path: path to the flora database. Either a URL or a local path.
    :param is_aggregated: if True, the flora is aggregated into one json file.
    :raises Exception: if ``flora_path`` is neither a ``Path`` nor a ``URL``.
    """

    def __init__(self, flora_path: Union[Path, URL], is_aggregated: bool = False):
        self.is_aggregated = is_aggregated

        if not isinstance(flora_path, (Path, URL)):
            raise Exception(f"flora must be a path or a url. ({flora_path})")

        if isinstance(flora_path, URL):
            # _get_remote_flora builds each herb's base path from self.workdir,
            # so it has to exist before the remote flora is loaded.
            # NOTE(review): a remote flora has no local directory of its own;
            # the current working directory is used as a fallback — confirm
            # this is the desired location.
            self.workdir = Path.cwd()
            self.flora = self._get_remote_flora(flora_path)

        if isinstance(flora_path, Path):
            # A single json file always means an aggregated flora, whatever
            # the caller specified.
            if flora_path.suffix == ".json":
                self.is_aggregated = True
            self.workdir = flora_path.parent.parent
            self.flora_path = flora_path
            self.flora = self._get_local_flora(flora_path)

        if is_aggregated != self.is_aggregated:
            logger.warning(
                f"flora has is_aggregated={self.is_aggregated}, "
                f"but was specified as is_aggregated={is_aggregated}."
            )

        logger.debug(f"flora workdir {self.workdir}")

    def _get_local_flora(self, flora_config: Path) -> List[Herb]:
        """
        _get_local_flora fetches the flora from a local folder or file.

        There are two scenarios:

        - The flora is one aggregated local json file.
        - The flora is a folder that contains folders of dataset ids.

        :param flora_config: path to the aggregated json file or to the folder.
        :return: one herb per metadata entry found.
        """
        if self.is_aggregated:
            with open(flora_config, "r") as f:
                json_flora = json.load(f)
        else:
            herb_paths = [f for f in Path(flora_config).iterdir() if f.is_dir()]
            json_flora = []
            for herb_path in herb_paths:
                # Close every metadata file right after reading it.
                with open(herb_path.joinpath("dataherb.json"), "r") as f:
                    json_flora.append(json.load(f))

        return [
            Herb(herb, base_path=self.workdir / f'{herb.get("id", "")}')
            for herb in json_flora
        ]

    def _get_remote_flora(self, flora_config: URL) -> List[Herb]:
        """
        _get_remote_flora fetches the flora from the remote API.

        !!! warning
            Currently, this mode only works for aggregated json flora.

        :param flora_config: URL of the remote flora.
        :return: one herb per entry of the downloaded json.
        :raises Exception: if the response status code is not 200.
        """
        flora_request = get_data_from_url(flora_config)

        if flora_request.status_code != 200:
            raise Exception(
                "Could not download dataherb flora from remote. status code: {}".format(
                    flora_request.status_code
                )
            )

        json_flora = flora_request.json()

        return [
            Herb(herb, base_path=self.workdir / f'{herb.get("id", "")}')
            for herb in json_flora
        ]

    def add(self, herb: Union[Herb, dict, MetaData]) -> None:
        """
        Add a herb to the flora and persist the change.

        :param herb: the herb, or the metadata (dict or MetaData) to build it from.
        :raises Exception: if a herb with the same id already exists, or if the
            input type is not supported.
        """

        herb = self._convert_to_herb(herb)

        logger.debug(f"adding herb with metadata: {herb.metadata}")

        for h_exist in self.flora:
            if herb.id == h_exist.id:
                raise Exception(f"herb id = {herb.id} already exists")

        self.flora.append(herb)
        if self.is_aggregated:
            self.save(path=self.flora_path)
        else:
            self.save(herb=herb)

    def _convert_to_herb(self, herb: Union[Herb, dict, MetaData]) -> Herb:
        """
        Convert the supported inputs to a Herb; a Herb is returned unchanged.

        :param herb: a Herb, a metadata dict, or a MetaData object.
        :raises Exception: if the input type is not supported.
        """
        if isinstance(herb, MetaData):
            return Herb(herb.metadata)
        if isinstance(herb, dict):
            return Herb(herb)
        if isinstance(herb, Herb):
            return herb

        raise Exception(f"Input herb type ({type(herb)}) is not supported.")

    def remove(self, herb_id: str) -> None:
        """
        Remove a herb from the flora and persist the change.

        An aggregated flora is rewritten without the herb; otherwise the
        herb is handed to ``remove_herb_from_flora``.

        :param herb_id: id of the herb to be removed.
        """
        for existing in self.flora:
            if existing.id == herb_id:
                logger.debug(f"found herb id = {herb_id}")

        self.flora = [h for h in self.flora if h.id != herb_id]

        if self.is_aggregated:
            self.save(path=self.flora_path)
        else:
            self.remove_herb_from_flora(herb_id)

    def save(
        self,
        path: Optional[Path] = None,
        id: Optional[str] = None,
        herb: Optional[Herb] = None,
    ) -> None:
        """
        Save flora metadata to json file.

        For an aggregated flora the metadata of every herb is written to
        ``path``. Otherwise only one herb is saved, into ``path / <herb id>``.

        :param path: target file (aggregated) or folder; defaults to ``self.flora_path``.
        :param id: id of the herb to be saved (non-aggregated flora only).
        :param herb: herb to be saved (non-aggregated flora only); takes
            precedence over ``id``.
        :raises Exception: if the flora is not aggregated and neither ``id``
            nor ``herb`` is provided.
        """

        if path is None:
            path = self.flora_path

        # The flora may be empty, e.g. right after its last herb was removed.
        if self.flora:
            logger.debug(
                f"type of a herb in flora: {type(self.flora[0])}\n{self.flora[0].metadata}"
            )

        if self.is_aggregated:
            serialized_flora = []
            for h in self.flora:
                logger.debug(f"herb (type {type(h)}): {h}")
                serialized_flora.append(h.metadata)

            with open(path, "w") as fp:
                json.dump(
                    serialized_flora,
                    fp,
                    sort_keys=True,
                    indent=4,
                    separators=(",", ": "),
                )
        else:
            if (not id) and (not herb):
                raise Exception("dataherb id must be provided")
            elif herb:
                logger.debug("Saving herb using herb object")
                self.save_herb_meta(id=herb.id, path=path / f"{herb.id}")
            elif id:
                logger.debug("Saving herb using herb id")
                self.save_herb_meta(id, path / f"{id}")

    def save_herb_meta(self, id: str, path: Optional[Path] = None) -> None:
        """
        Save the metadata of a herb to ``dataherb.json`` inside ``path``.

        :param id: id of the herb whose metadata is saved.
        :param path: target folder, created if missing; defaults to
            ``self.workdir / id``.
        """
        if path is None:
            path = self.workdir / f"{id}"

        if not path.exists():
            path.mkdir(parents=True)

        logger.debug(f"Will replace dataherb id {id}")
        with open(path / "dataherb.json", "w") as fp:
            json.dump(
                self.herb_meta(id), fp, sort_keys=True, indent=4, separators=(",", ": ")
            )

    def remove_herb_from_flora(self, id: str, path: Optional[Path] = None) -> None:
        """
        Remove the folder of a herb from disk.

        A missing folder is only logged; a failure to delete is logged as an
        error and not raised.

        :param id: id of the herb to be removed.
        :param path: folder to delete; defaults to ``self.workdir / id``.
        """
        if path is None:
            path = self.workdir / f"{id}"

        if not path.exists():
            logger.debug(f"dataherb {id} doesn't exist")
            return

        try:
            shutil.rmtree(path)
        except OSError as e:
            logger.error(
                f"Can not remove herb id {id}: {e.filename} - {e.strerror}."
            )

    def search(self, keywords: Union[str, List[str]]) -> List[dict]:
        """
        search finds the datasets that match the keywords.

        :param keywords: keywords to be searched; a single string is treated
            as a list with one keyword.
        """
        if isinstance(keywords, str):
            keywords = [keywords]

        return search_by_keywords_in_flora(flora=self.flora, keywords=keywords)

    def herb_meta(self, id: str) -> Optional[dict]:
        """
        herb_meta returns the metadata of the first dataset matching the id.

        :param id: herb id of the dataset
        :return: the metadata of the herb, or None if no herb is found.
        """

        herbs = _search_by_ids_in_flora(self.flora, id)
        if not herbs:
            return None

        herb = herbs[0].get("herb")
        return herb.metadata if herb else None

    def herb(self, id: str) -> Optional[Herb]:
        """
        herb returns the first dataset matching the id.

        Duplicated ids and missing herbs are logged as errors.

        :param id: herb id
        :return: the matching herb, or None if no herb is found.
        """

        herbs = _search_by_ids_in_flora(self.flora, id)
        if len(herbs) > 1:
            logger.error(
                f"Found multiple datasets with id {id}, please fix this in your flora data json file, e.g, WORKDIRECTORY/flora/flora.json."
            )

        if not herbs:
            logger.error(f"Could not find herb {id}")
            return None

        herb = herbs[0].get("herb")
        return herb if herb else None

_get_local_flora(flora_config) ¤

_get_local_flora fetches the flora from a local folder or file.

There are two scenarios:

  • The flora is one aggregated local json file.
  • The flora is a folder that contains folders of dataset ids.
Source code in dataherb/flora.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def _get_local_flora(self, flora_config: Path) -> List[Herb]:
    """
    _get_local_flora fetches the flora from a local folder or file.

    There are two scenarios:

    - The flora is one aggregated local json file.
    - The flora is a folder that contains folders of dataset ids.

    :param flora_config: path to the aggregated json file or to the folder.
    :return: one herb per metadata entry found.
    """
    if self.is_aggregated:
        with open(flora_config, "r") as f:
            json_flora = json.load(f)
    else:
        herb_paths = [f for f in Path(flora_config).iterdir() if f.is_dir()]
        json_flora = []
        for herb_path in herb_paths:
            # Close every metadata file right after reading it instead of
            # leaving the handle to the garbage collector.
            with open(herb_path.joinpath("dataherb.json"), "r") as f:
                json_flora.append(json.load(f))

    return [
        Herb(herb, base_path=self.workdir / f'{herb.get("id", "")}')
        for herb in json_flora
    ]

_get_remote_flora(flora_config) ¤

_get_remote_flora fetches the flora from the remote API.

Warning

Currently, this mode only works for aggregated json flora.

Source code in dataherb/flora.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def _get_remote_flora(self, flora_config: URL) -> List[Herb]:
    """
    Download the flora from the remote API and wrap every entry in a Herb.

    !!! warning
        Currently, this mode only works for aggregated json flora.

    :param flora_config: URL of the remote flora.
    :return: one herb per entry of the downloaded json.
    :raises Exception: if the response status code is not 200.
    """
    response = get_data_from_url(flora_config)

    # Guard clause: anything but a plain success aborts the load.
    if response.status_code != 200:
        raise Exception(
            "Could not download dataherb flora from remote. status code: {}".format(
                response.status_code
            )
        )

    remote_herbs = []
    for herb in response.json():
        herb_dir = self.workdir / f'{herb.get("id", "")}'
        remote_herbs.append(Herb(herb, base_path=herb_dir))

    return remote_herbs

add(herb) ¤

Add a herb to the flora.

Source code in dataherb/flora.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def add(self, herb: Union[Herb, dict, MetaData]) -> None:
    """
    Register a new herb in the flora and persist the change.

    :param herb: the herb, or the metadata (dict or MetaData) to build it from.
    :raises Exception: if a herb with the same id is already in the flora.
    """
    herb = self._convert_to_herb(herb)

    logger.debug(f"adding herb with metadata: {herb.metadata}")

    # Ids must stay unique within a flora.
    if any(herb.id == known.id for known in self.flora):
        raise Exception(f"herb id = {herb.id} already exists")

    self.flora.append(herb)

    if not self.is_aggregated:
        self.save(herb=herb)
        return

    self.save(path=self.flora_path)

herb(id) ¤

herb returns the Herb object of the dataset with the given id, or None if no such dataset is found.

Parameters:

Name Type Description Default
id str

herb id

required
Source code in dataherb/flora.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def herb(self, id: str) -> Optional[Herb]:
    """
    Look up a herb by id and return the first match.

    Duplicated ids and missing herbs are logged as errors.

    :param id: herb id
    :return: the matching herb, or None if no herb is found.
    """
    matches = _search_by_ids_in_flora(self.flora, id)

    if len(matches) > 1:
        logger.error(
            f"Found multiple datasets with id {id}, please fix this in your flora data json file, e.g, WORKDIRECTORY/flora/flora.json."
        )

    if not matches:
        logger.error(f"Could not find herb {id}")
        return None

    # Each search result is a dict holding the herb under the "herb" key.
    found = matches[0].get("herb")
    return found if found else None

herb_meta(id) ¤

herb_meta returns the metadata of the dataset with the given id, or None if no such dataset is found.

Parameters:

Name Type Description Default
id str

herb id of the dataset

required
Source code in dataherb/flora.py
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def herb_meta(self, id: str) -> Optional[dict]:
    """
    Return the metadata of the first herb matching the id.

    :param id: herb id of the dataset
    :return: the metadata of the herb, or None if no herb is found.
    """
    matches = _search_by_ids_in_flora(self.flora, id)
    if not matches:
        return None

    # Each search result is a dict holding the herb under the "herb" key.
    found = matches[0].get("herb")
    if not found:
        return None

    return found.metadata

remove(herb_id) ¤

Removes a herb from the flora.

Source code in dataherb/flora.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def remove(self, herb_id: str) -> None:
    """
    Drop a herb from the flora and persist the change.

    An aggregated flora is rewritten without the herb; otherwise the herb is
    handed to ``remove_herb_from_flora``.

    :param herb_id: id of the herb to be removed.
    """
    remaining = []
    for existing in self.flora:
        if existing.id == herb_id:
            logger.debug(f"found herb id = {herb_id}")

    for existing in self.flora:
        if existing.id != herb_id:
            remaining.append(existing)
    self.flora = remaining

    if not self.is_aggregated:
        self.remove_herb_from_flora(herb_id)
        return

    self.save(path=self.flora_path)

remove_herb_from_flora(id, path=None) ¤

Remove the folder of a herb from the local flora.

Source code in dataherb/flora.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
def remove_herb_from_flora(self, id: str, path: Optional[Path] = None) -> None:
    """
    Delete the folder of a herb from disk.

    A missing folder is only logged; a failure to delete is logged as an
    error and not raised.

    :param id: id of the herb to be removed.
    :param path: folder to delete; defaults to ``self.workdir / id``.
    """
    target = self.workdir / f"{id}" if path is None else path

    if not target.exists():
        logger.debug(f"dataherb {id} doesn't exist")
        return

    try:
        shutil.rmtree(target)
    except OSError as e:
        logger.error(
            f"Can not remove herb id {id}: {e.filename} - {e.strerror}."
        )

save(path=None, id=None, herb=None) ¤

save flora metadata to json file

Source code in dataherb/flora.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def save(
    self,
    path: Optional[Path] = None,
    id: Optional[str] = None,
    herb: Optional[Herb] = None,
) -> None:
    """
    Save flora metadata to json file.

    For an aggregated flora the metadata of every herb is written to
    ``path``. Otherwise only one herb is saved, into ``path / <herb id>``.

    :param path: target file (aggregated) or folder; defaults to ``self.flora_path``.
    :param id: id of the herb to be saved (non-aggregated flora only).
    :param herb: herb to be saved (non-aggregated flora only); takes
        precedence over ``id``.
    :raises Exception: if the flora is not aggregated and neither ``id`` nor
        ``herb`` is provided.
    """

    if path is None:
        path = self.flora_path

    # The flora may be empty (e.g. right after its last herb was removed);
    # indexing it unconditionally would raise IndexError before anything
    # gets written.
    if self.flora:
        logger.debug(
            f"type of a herb in flora: {type(self.flora[0])}\n{self.flora[0].metadata}"
        )

    if self.is_aggregated:
        serialized_flora = []
        for h in self.flora:
            logger.debug(f"herb (type {type(h)}): {h}")
            serialized_flora.append(h.metadata)

        with open(path, "w") as fp:
            json.dump(
                serialized_flora,
                fp,
                sort_keys=True,
                indent=4,
                separators=(",", ": "),
            )
    else:
        if (not id) and (not herb):
            raise Exception("dataherb id must be provided")
        elif herb:
            logger.debug("Saving herb using herb object")
            self.save_herb_meta(id=herb.id, path=path / f"{herb.id}")
        elif id:
            logger.debug("Saving herb using herb id")
            self.save_herb_meta(id, path / f"{id}")

save_herb_meta(id, path=None) ¤

Save the metadata of a herb to a dataherb.json file.

Source code in dataherb/flora.py
191
192
193
194
195
196
197
198
199
200
201
202
203
def save_herb_meta(self, id: str, path: Optional[Path] = None) -> None:
    """
    Write the metadata of a herb to ``dataherb.json`` inside ``path``.

    :param id: id of the herb whose metadata is saved.
    :param path: target folder, created if missing; defaults to
        ``self.workdir / id``.
    """
    target_dir = path if path is not None else self.workdir / f"{id}"

    if not target_dir.exists():
        target_dir.mkdir(parents=True)

    logger.debug(f"Will replace dataherb id {id}")

    meta_file = target_dir / "dataherb.json"
    with open(meta_file, "w") as fp:
        json.dump(
            self.herb_meta(id),
            fp,
            sort_keys=True,
            indent=4,
            separators=(",", ": "),
        )

search(keywords) ¤

search finds the datasets that match the keywords.

Parameters:

Name Type Description Default
keywords Union[str, List[str]]

keywords to be searched

required
Source code in dataherb/flora.py
221
222
223
224
225
226
227
228
229
230
def search(self, keywords: Union[str, List[str]]) -> List[dict]:
    """
    Find the datasets in the flora that match the keywords.

    :param keywords: keywords to be searched; a single string is treated as
        a list with one keyword.
    """
    terms = [keywords] if isinstance(keywords, str) else keywords
    return search_by_keywords_in_flora(flora=self.flora, keywords=terms)