1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

""" 

:Author: Daniel Mohr 

:Email: daniel.mohr@dlr.de 

:Date: 2021-03-09 (last change). 

:License: GNU GENERAL PUBLIC LICENSE, Version 3, 29 June 2007. 

""" 

 

import bz2 

import gzip 

import json 

import lzma 

import os 

import os.path 

import ssl 

import urllib.request 

 

 

def lzma_open(filename, mode, *params, container_format=lzma.FORMAT_ALONE):
    """
    :Author: Daniel Mohr
    :Date: 2021-03-09

    Open an LZMA compressed file with :func:`lzma.open`.

    The only difference to calling :func:`lzma.open` directly is the
    default container format: ``lzma.FORMAT_ALONE`` is used unless
    another format is requested.

    :param filename: forwarded unchanged as first argument of ``lzma.open``
    :param mode: forwarded unchanged as second argument of ``lzma.open``
    :param params: further positional arguments, forwarded unchanged
    :param container_format: forwarded to ``lzma.open`` as keyword
                             argument ``format``

    :return: the object returned by ``lzma.open``
    """
    open_kwargs = {'format': container_format}
    return lzma.open(filename, mode, *params, **open_kwargs)

 

 

def _cache_opener(cachefilenamepath):
    """Return the open function matching the extension of the cache file."""
    ext = os.path.splitext(cachefilenamepath)[1].lower()
    if ext == '.lzma':
        # .lzma files go through the module's lzma_open wrapper instead of
        # plain lzma.open (which is used for .xz)
        return lzma_open
    compressed = {'.gz': gzip.open, '.xz': lzma.open, '.bz2': bz2.open}
    # '.jsonld' and every unknown extension are handled as plain files
    return compressed.get(ext, open)


def get_schema_org_data(cachefilepath='', cachefilename=''):
    """
    :Author: Daniel Mohr
    :Date: 2021-03-09

    This function returns the data from
    https://schema.org/version/latest/schemaorg-current-https.jsonld
    as a json-ld structure (parsed by the python module json).

    :param cachefilepath: This path is used as the path for the
                          cachefilename. It is created (including missing
                          parent directories) before the cache file is
                          written.
    :param cachefilename: If not set to an empty string, the data is read
                          from this file. If this file does not exist,
                          the data is loaded from the website and stored
                          in this file. The extension selects the
                          compression: '.gz', '.lzma', '.xz' or '.bz2';
                          any other extension means an uncompressed file.

    :return: json-ld structure (parsed by the python module json) as
             dicts and lists

    Exceptions raised by the file access, the download or the json
    parsing are not caught here.
    """
    use_cache = bool(cachefilename)
    if use_cache:
        cachefilenamepath = os.path.join(cachefilepath, cachefilename)
        open_cmd = _cache_opener(cachefilenamepath)
        if os.path.isfile(cachefilenamepath):
            # cache hit: return the stored data and leave the file untouched
            with open_cmd(cachefilenamepath, 'rb') as fd:
                return json.load(fd)
    url = \
        'https://schema.org/version/latest/schemaorg-current-https.jsonld'
    context = ssl.create_default_context()
    with urllib.request.urlopen(url, context=context) as fd:
        schema_org_data = json.load(fd)
    if use_cache and (schema_org_data is not None):
        if cachefilepath:
            # also creates missing parents; an existing directory is fine
            os.makedirs(cachefilepath, exist_ok=True)
        with open_cmd(cachefilenamepath, 'wb') as fd:
            fd.write(json.dumps(schema_org_data).encode())
    return schema_org_data