datasets

class marius.tools.preprocess.dataset.Dataset(output_directory, spark=False)

Abstract base class for datasets. Subclasses must implement download() and preprocess().

Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
relation_mapping_file: pathlib.Path
node_type_file: pathlib.Path
dataset_name: str
dataset_url: str
__init__(output_directory, spark=False)
output_directory: pathlib.Path
spark: bool
edge_list_file: pathlib.Path
node_features_file: pathlib.Path
relation_features_file: pathlib.Path
abstract download(overwrite=False)
abstract preprocess()
Return type

DatasetConfig

class marius.tools.preprocess.datasets.fb15k.FB15K(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.fb15k_237.FB15K237(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.freebase86m.Freebase86m(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.friendster.Friendster(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, generate_random_features=False, node_feature_dim=32, num_classes=50, node_splits=[0.1, 0.05, 0.05], partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.livejournal.Livejournal(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=[0.9, 0.05, 0.05], sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.ogb_mag240m.OGBMag240M(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.ogb_wikikg90mv2.OGBWikiKG90Mv2(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.ogbl_citation2.OGBLCitation2(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.ogbl_ppa.OGBLPpa(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False, remap_ids=True)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.ogbl_wikikg2.OGBLWikiKG2(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.ogbn_arxiv.OGBNArxiv(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.ogbn_papers100m.OGBNPapers100M(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.ogbn_products.OGBNProducts(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool
class marius.tools.preprocess.datasets.twitter.Twitter(output_directory, spark=False)
Parameters
  • output_directory (pathlib.Path) –

  • spark (bool) –

__init__(output_directory, spark=False)
Parameters

output_directory (pathlib.Path) –

dataset_name: str
dataset_url: str
download(overwrite=False)
preprocess(num_partitions=1, remap_ids=True, splits=[0.9, 0.05, 0.05], sequential_train_nodes=False)
edge_list_file: pathlib.Path
edge_features_file: pathlib.Path
node_mapping_file: pathlib.Path
node_features_file: pathlib.Path
relation_mapping_file: pathlib.Path
relation_features_file: pathlib.Path
node_type_file: pathlib.Path
output_directory: pathlib.Path
spark: bool