diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 000000000..c42a27cd5 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,12 @@ +# These are supported funding model platforms + +github: AlexKuhnle +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: TensorforceTeam +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie username +custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] diff --git a/.gitignore b/.gitignore index ccb5bbaad..afda85fe4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,24 @@ +*__pycache__* +*.DS_Store +*.egg *.pyc +*.swp +*.vscode + +*/.ipynb_checkpoints/ + +/CONTRIBUTING.html +/PROJECTS.html +/README.html +/UPDATE_NOTES.html + +/.eggs/ +/.pytest_cache/ +/_vizdoom/ +/build/ /dist/ -/*.egg-info -/.idea/ -/logs/* -/tensorforce/logs/* -/tensorforce_0.12 -/log_* -/logs_async /docs/_* -tf_worker_*.txt -.DS_Store -*.eggs -*.vscode +/examples/vizdoom/ +/tensorforce.egg-info/ + +/_old* diff --git a/.travis.yml b/.travis.yml index 2a40bf784..464255a95 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,35 +1,24 @@ -sudo: required -dist: trusty language: python +python: + - "3.7" -services: - - docker - -matrix: - include: - - env: - - DOCKER_JOB_NAME='tensorforce_pytest_py2' - - DOCKER_IMAGE='tensorflow/tensorflow:latest' - - env: - - DOCKER_JOB_NAME='tensorforce_pytest_py3' - - DOCKER_IMAGE='tensorflow/tensorflow:latest-py3' +branches: + only: + - master before_install: - - docker pull ${DOCKER_IMAGE} - - docker run -d --rm --name ${DOCKER_JOB_NAME} -v $(pwd):/tensorforce ${DOCKER_IMAGE} /bin/bash -c 'while true; do sleep 1; done'; + - sudo apt-get -y install swig install: - - docker exec ${DOCKER_JOB_NAME} /bin/sh -c 'python -m pip install -U pip'; - - docker exec ${DOCKER_JOB_NAME} /bin/sh -c 'python -m pip install pytest-xdist'; - - docker exec ${DOCKER_JOB_NAME} /bin/sh -c 'python -m pip install gym==0.9.5'; - - docker exec ${DOCKER_JOB_NAME} /bin/sh -c 'cd /tensorforce && python -m pip install .'; + - pip install .[tfa,tune,ale,gym,retro] + - pip install pytest script: - - docker exec ${DOCKER_JOB_NAME} /bin/sh -c 'cd /tensorforce && python -m pytest'; + - pytest notifications: email: recipients: - - contact@reinforce.io + - tensorforce.team@gmail.com on_success: never - on_failure: always \ No newline at end of file + on_failure: always diff --git a/BUILD b/BUILD deleted file mode 100644 index 9ee224080..000000000 --- a/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -package(default_visibility = ["//visibility:public"]) - -tensorforce_args = [ - "--agent VPGAgent", - "--agent-config /configs/vpg_baseline_visual.json", - "--network-config /configs/cnn_dqn_network.json", - "--episodes 1000", - "--max-timesteps 1000" -] - -py_library( - name = "tensorforce", - imports = [":tensorforce"], - data = ["//tensorforce:examples/configs/vpg_baseline_visual.json", - "//tensorforce:examples/configs/cnn_dqn_network.json"], - srcs = glob(["tensorforce/**/*.py"]) -) - -py_binary( - name = "lab_runner", - srcs = ["examples/lab_main.py"], - args = tensorforce_args, - data = ["//:deepmind_lab.so"], - main = "examples/lab_main.py", - deps = [":tensorforce"] -) - - - diff --git a/CONTRIBUTING.md 
b/CONTRIBUTING.md index 6f7c6233d..455e4857e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,168 +1,23 @@ -Contribution guide ------------------- +# Contribution guide -Below are some pointers for new contributions. In general, it is probably always a good idea to -join the community to discuss a contribution, unless it's a smaller bug fix. You can join the -community by filling in -[this short form](https://docs.google.com/forms/d/1_UD5Pb5LaPVUviD0pO0fFcEnx_vwenvuc00jmP2rRIc/) -which will take you to the chat after. +Please always get in touch on [Gitter](https://gitter.im/tensorforce/community) before starting to work on a pull request, unless it is a smaller bug fix involving only a few lines of code. -### 1. Code style -In general, we try to follow the -[Google Python style guide](https://google.github.io/styleguide/pyguide.html) with a few -particulars. Another good rule of thumb is that if something is a PEP8 warning in your editor, it -is probably worth looking at. +### Code style -Some things to pay attention to: - -- Lines should have a max length of 120 characters, 100 for documentation comments. - -- When initializing objects such as dictionaries or lists and there are multiple entries, use the -following format: - -```python -# One key-value pair per line, one indent level. -dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward -) -``` - -- When calling TensorFlow functions, use named arguments for readability wherever possible: +- [Google Python style guide](https://google.github.io/styleguide/pyguide.html) +- Maximum line length: 100 characters; tab size: 4 spaces +- There should be no PEP8 warnings (apart from E501 regarding line length) +- If arguments (when initializing objects, calling functions, specifying lists/dicts, etc.) do not fit into a single line, they should be placed on one or more separate tab-indented lines, like this: ```python -scaffold = tf.train.Scaffold( - init_op=init_op, - init_feed_dict=None, - init_fn=init_fn, - ready_op=ready_op, - ready_for_local_init_op=ready_for_local_init_op, - local_init_op=local_init_op, - summary_op=summary_op, - saver=saver, - copy_from_scaffold=None +super().__init__( + states=states, actions=actions, l2_regularization=l2_regularization, + parallel_interactions=parallel_interactions, config=config, saver=saver, summarizer=summarizer ) ``` -- Indentations should always be tab-spaced (tab size: 4), instead of based on alignments to the previous line: -```python -states_preprocessing_variables = [ - variable for name in self.states_preprocessing.keys() - for variable in self.states_preprocessing[name].get_variables() -] -``` - -instead of: - -```python -states_preprocessing_variables = [variable for name in self.states_preprocessing.keys() - for variable in self.states_preprocessing[name].get_variables()] -``` - -or: - -```python -kwargs['fn_loss'] = (lambda: self.fn_loss( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - update=update -)) -``` - -instead of: - -```python -kwargs['fn_loss'] = ( - lambda: self.fn_loss(states=states, internals=internals, actions=actions, - terminal=terminal, reward=reward, update=update) -) -``` - -- Binary operators should always be surrounded by a single space: `z = x + y` instead of `z=x+y`. - -- Numbers should always be explicitly given according to their intended type, so floats always with period, `1.0`, and integers without, `1`.
Floats should furthermore explicitly add single leading/trailing zeros where applicable, so `1.0` instead of `1.` and `0.1` instead of `.1`. - -- Lines (apart from documentation comments), including empty lines, should never contain trailing -spaces. - -- Comments, even line comments, should be capitalised. - -- We prefer line comments to be above the line of code they are commenting on for shorter lines: - -```python -# This is a line comment. -input_values = dict() -``` - -instead of: - -```python -input_values = dict() # this is a non-capitalised line comment making the line unnecessarily long -``` - - -### 2. Architecture - -New contributions should integrate into the existing design ideas. To this end, reading our -[blog](https://reinforce.io/blog/)) can be very helpful. The key design elements to understand are -the optimizer package (as described in the blog), the idea to move all reinforcement learning -control flow into the TensorFlow graph, and the general object hierarchy of models. Again, for -detailed questions do join the chat. - - -### 3. Areas of contribution - -Below are some potential areas of contribution. Feel free to make new suggestions on your own. - -Environments: - -TensorForce provides a generic -[enviroment class](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/environments/environment.py). -Applications do not need to implement this but it provides the advantage of using tne ```Runner``` -execution infrastructure. A number of implemented environments can be found in the contrib folder. -Implementing a binding for a new environment is a great way to better understand the agent API and -makes for a good first contribution. Below is a list of environments one might look at: - -- Gazebo robotic simulation - [link](http://gazebosim.org) -- Carla, Open-source simulator for autonomous driving research - [link](https://github.com/carla-simulator/carla) -- Unity game engine - [link](https://github.com/Unity-Technologies/ml-agents) -- Project Malmo minecraft bindnig - [link](https://github.com/Microsoft/malmo) -- DeepMind Starcraft 2 learning environment - [link](https://github.com/deepmind/pysc2) -- DeepMind control, dm_control - [link](https://github.com/deepmind/dm_control) -- DeepMind pycolab - [link](https://github.com/deepmind/pycolab) -- OpenAI roboschool - [link](https://github.com/openai/roboschool) -- DeepGTAV - GTA 5 self-driving car research environment - [link](https://github.com/aitorzip/DeepGTAV) -- PyGame learning environment - [link](https://github.com/ntasfi/PyGame-Learning-Environment) -- Siemens industrial control benchmark - [link](https://github.com/siemens/industrialbenchmark) -- ViZDoom, Doom based research paltform - [link](https://github.com/mwydmuch/ViZDoom) - -Models: - -Reinforcement learning is a highly active field of research and new models are appearing with high -frequency. Our main development focus is on providing abstractions and architecture, so model -contributions are very welcome. Note that integrating some models may require some discussion on -interfacing the existing models, especially in the case of newer architectures with complex -internal models. Some model suggestions: - -- ACER - [paper](https://arxiv.org/abs/1611.01224) -- Direct future prediction (n.b. this will require architecture changes) - [paper](https://arxiv.org/abs/1611.01779) -- Categorical DQN reimplementation. 
A categorical DQN implementation was part of 0.2.2 but was removed - because it did not easily integrate into the optimizer architecture. If you are interested in this model, - please comment in the issue or join the chat for discussion. -- Rainbow DQN, needs categorical DQN first. - [paper](https://arxiv.org/abs/1710.02298) -- Distributional RL with quantile regression - [paper](https://arxiv.org/pdf/1710.10044.pdf) - -Ecosystem integrations: - -If you are interested in general usability, another area of contribution is integrations into the -wider machine learning and data processing ecosystem. For example, providing scripts to run -TensorForce on one of a number of cloud service providers, or to run jobs on data infrastructure -frameworks like Kubernetes, Spark, etc is a great way to make RL more accessible. +- TensorFlow as well as Tensorforce-internal function calls should use named arguments wherever possible +- Binary operators should always be surrounded by a single space, so `z = x + y` instead of `z=x+y` +- Numbers should always be specified according to their intended type, so `1.0` as opposed to `1` in the case of floats, and vice versa for integers. For clarity, floats should furthermore add single leading/trailing zeros where necessary, so `1.0` instead of `1.` and `0.1` instead of `.1`. +- Line comments should generally be in a separate line preceding the line(s) they are commenting on, and not be added after the code as a suffix. diff --git a/FAQ.md b/FAQ.md deleted file mode 100644 index 751047e0c..000000000 --- a/FAQ.md +++ /dev/null @@ -1,142 +0,0 @@ -TensorForce FAQ -=============== - -### 1. How can I use TensorForce in a new environment or application? - -This depends on the control flow of your problem. For most simulations, it is convenient to -implement a binding to the TensorForce `Environment` class (various examples in contrib). The -advantage of this is that it allows you to use the existing execution scripts, in particular the -`Runner` utility. The general workflow is to copy one of the example scripts in the examples folder -which parse arguments and call the runner. The runner will then control execution by calling your -environment for the specified number of steps. - -If you have a real-world environment, things are generally different as you may not be able to -delegate control flow to TensorForce. Instead, your external application might use TensorFlow as a -library, and call `act()` and `observe()` when new data is available. Consider the quickstart -example in the readme. - - -### 2. Why is my algorithm not learning? - -Generally, this is either because there is a bug in our implementation or a problem in your -configuration or application. The reality of reinforcement learning is that getting things to work -is *very* difficult and we will not be able to tell you why your specific thing is not working. -Newcomers with some experience in deep learning, where successfully training small networks is -easy, often carry over this expectation to reinforcement learning. Issues that simply -ask for configuration help without doing some research (small ablation analysis, using some -known hyper-parameters from papers, or reasonable argument why something should work) will be -closed with reference to this document. - -Please appreciate that for almost every problem, none of the default configurations will work, -usually because batch sizes and learning rates are wrong, or you need vastly more data. 
Substantial -practical experience is required to get an intuition for what is possible with which amount of data -for a given problem. Reproducing papers is extremely difficult. From a user perspective, the -expectation should be that to get an algorithm to work on *any* problem, hyper-parameter tuning -is required. - -That being said, there are small implementation issues in some of the Q-models due to the move to -full TensorFlow code, and the current recommendation is to use PPO unless there is a good reason -not to. - - -### 3. Can you implement paper X? - -We get many feature requests and the answer to most is "maybe". Reinforcement learning is a very -active, fast moving field, but at the same time very immature as a technology. This means most new -approaches will likely not be practically relevant going forward, this is the nature of research, -even though these approaches inform the development of more mature algorithms later. - -Further, new approaches are often unnecessarily complex, which often only becomes clear in -hindsight. For example, PPO both performs much better than TRPO and is much simpler to implement. -TensorForce is not meant to be a collection of every new available trick. This is in particular not -possible due to the architecture choices we have made to design full TensorFlow reinforcement -learning graphs. Integrating new techniques into this architecture tends to require *much* higher -effort than just implementing the new method from scratch in a separate script. - -For this reason, we mostly do not implement new papers straight away unless they are extremely -convincing and have a good implementation difficulty to return ratio. - - -### 4. How can I use an evolutionary or natural gradient optimizer? - -By changing the type of the optimizer where appropriate. For example, a vanilla policy gradient may -use an evolutionary optimizer via: - -```python -optimizer=dict( - type='evolutionary', - learning_rate=1e-2 -) -``` - -and a natural gradient optimizer via: - -```python -optimizer=dict( - type='natual_gradient', - learning_rate=1e-2 -) -``` - -Please note that not every model can sensibly make use of every optimizer. Read up on the -individual model, for example, TRPO is by default a natural gradient model. - - -### 5. What is deterministic mode? How does it relate to evaluation? - -The deterministic flag on ```act``` only concerns the action selection. It does -not affect whether training is performed or not. Training is controlled via -```observe``` calls. The deterministic flag is relevant with regard to stochastic -and deterministic policies. For example, policy gradient models typically -assume a stochastic policy during training (unless when using deterministic policy -gradients). Q-models, which we have also implemented inheriting from a distribution -model (see our blog posts on architecture), deterministically sample their action -via their greedy maximisation. - -When evaluating the final trained model, one would typically act deterministically to -avoid sampling random actions. - -### 6. How do I specify multiple actions? - -Actions can either be specified as single actions which just require a dict specifying -a ```type``` and ```shape```, or a ```num_actions``` parameter in the case of discrete actions. -A ```shape``` can be used to specify multiple actions of the same type, e.g. -```actions=dict(type='float', shape=(5,), min=-2, max=2)``` specifies 5 float actions -with the same range. 
Please note that using the min/max parameter results in -a Beta distribution instead of a Gaussian distribution being used for continuous -actions. If a single action is desired, use ```shape=()```. - -The parameter ```num_actions``` is only used for discrete -actions and describes the number of different options for a single discrete action. That is, -```dict(type='int', shape=(2,), num_actions=4)``` means the agent will output two actions -per step, each giving values in [0, 1, 2, 3]. - -There are no restrictions on combining multiple action types. Available action types -are ```float,int,bool```. In the case of multiple actions, the agent expects a dict containing -action names and their types. For example, - -```python -actions=dict( - discrete_action=dict(type='int', num_actions=15), - continous_action=dict(type='float', shape=()), -) -``` - -will lead to the agent outputting the following action dict (example action): - -```python -actions = agent.act(states=state) - ->>> print actions ->>> dict(discrete_action=3, continuous_action=0.332) -``` - -In the case of multiple actions, the values will be arrays. It is a good idea -to use expressive action names within your application to keep track of -each action and its purpose. - - - -### 7. I really need a certain feature or help with my application. - -Please contact ```contact@reinforce.io``` for commercial support. diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index ea5ac4be6..000000000 --- a/Jenkinsfile +++ /dev/null @@ -1,10 +0,0 @@ -pipeline { - agent { docker 'python:3.5.1' } - stages { - stage('build') { - steps { - sh 'python --version' - } - } - } -} diff --git a/LICENSE b/LICENSE index d79d52b0f..6115b6303 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2017 reinforce.io. All Rights Reserved. +Copyright 2018 Tensorforce Team. All Rights Reserved. Apache License Version 2.0, January 2004 @@ -188,7 +188,7 @@ Copyright 2017 reinforce.io. All Rights Reserved. same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2017 reinforce.io. + Copyright 2018 Tensorforce Team. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -200,4 +200,4 @@ Copyright 2017 reinforce.io. All Rights Reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index bb3ec5f0d..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include README.md diff --git a/PROJECTS.md b/PROJECTS.md new file mode 100644 index 000000000..b7f4cb8aa --- /dev/null +++ b/PROJECTS.md @@ -0,0 +1,90 @@ +# Projects using Tensorforce + +Please [get in touch](mailto:tensorforce.team@gmail.com) if you would like your project to be listed here. + + + + +### Active flow control + + +Tensorforce has been used within fluid mechanics to perform active flow control. Active flow control is known to be challenging due to the combination of non-linearity, high dimensionality, and time dependence inherent in fluid mechanics, which makes DRL a promising new tool within this research field. + +This project performs flow control of the 2D Kármán Vortex Street with Deep Reinforcement Learning.
The simulations are done with FEniCS, while the Reinforcement Learning is performed with the help of the library Tensorforce. You will need FEniCS, TensorFlow, Tensorforce and Gmsh available on your system in order to be able to run the code. + +[Paper](https://arxiv.org/abs/1808.07664)  |  [GitHub Project](https://github.com/jerabaul29/Cylinder2DFlowControlDRL) + + + + +### DeepCrawl + + +DeepCrawl is a turn-based strategy game for mobile platforms, where all the enemies are trained with Deep Reinforcement Learning algorithms. The game is designed to be hard, yet fair: the player will have to explore the dungeons and defeat all the guardians of the rooms, paying attention to every move the AI makes! + +The game was developed in Unity, while the AI was built through Tensorforce and Unity ML-Agents. + +The project was part of a Master's thesis in Computer Engineering at Università degli Studi di Firenze, with the title *"DeepCrawl: Deep Reinforcement Learning for turn-based strategy games"*. + +[GitHub Project](https://github.com/SestoAle/DeepCrawl) + + + + +### SimPyFab + + +Complex job shop manufacturing systems are motivated by the manufacturing characteristics of semiconductor wafer fabrication. A job shop consists of several machines (processing resources) that process jobs (products, orders) based on a defined list of process steps. After every process, the job is dispatched and transported to the next processing machine. Machines are usually grouped into sub-areas by processing type, i.e. similar processing capabilities are next to each other. + +This framework provides an integrated simulation and reinforcement learning model to investigate the potential of data-driven reinforcement learning in production planning and control of complex job shop systems. The simulation model allows parametrization of a broad range of job shop-like manufacturing systems. Furthermore, performance statistics and logging of performance indicators are provided. Reinforcement learning is implemented to control the order dispatching, and several dispatching heuristics used in practice provide benchmarks. + +[GitHub Project](https://github.com/AndreasKuhnle/SimRLFab) + + + + +### Navbot: Using RGB Image as Visual Input for Mapless Robot Navigation + + +A collection for mapless robot navigation using RGB images as visual input. It contains the test environment and motion planners, aiming at realizing all three levels of mapless navigation: + +1. memorizing efficiently; +2. from memorizing to reasoning; +3. more powerful reasoning + +[GitHub Project](https://github.com/marooncn/navbot) + + + + +### Adaptive Behavior Generation for Autonomous Driving + + +Making the right decision in traffic is a challenging task that is highly dependent on individual preferences as well as the surrounding environment. Therefore it is hard to model solely based on expert knowledge. In this work we use Deep Reinforcement Learning to learn maneuver decisions based on a compact semantic state representation. This ensures a consistent model of the environment across scenarios, as well as a behavior adaptation function enabling on-line changes of desired behaviors without re-training. The input for the neural network is a simulated object list similar to that of Radar or Lidar sensors, superimposed by a relational semantic scene description. The state as well as the reward are extended by a behavior adaptation function and a parameterization, respectively.
With little expert knowledge and a set of mid-level actions, it can be seen that the agent is capable of adhering to traffic rules and learns to drive safely in a variety of situations. + +[Paper](https://arxiv.org/abs/1809.03214) + + + + +### Bitcoin trading bot + + +This project is a Tensorforce-based Bitcoin trading bot (algo-trader). It uses deep reinforcement learning to automatically buy/sell/hold BTC based on what it learns about BTC price history. Most blogs / tutorials / boilerplate BTC trading-bots you'll find out there use supervised machine learning, likely an LSTM. That's well and good - supervised learning learns what makes a time-series tick so it can predict the next-step future. But that's where it stops. It says "the price will go up next", but it doesn't tell you what to do. Well that's simple, buy, right? Ah, buy low, sell high - it's not that simple. Thousands of lines of code go into trading rules, "if this then that" style. Reinforcement learning takes supervised to the next level - it embeds supervised within its architecture, and then decides what to do. It's beautiful stuff! + +This project goes with Episode 26+ of [Machine Learning Guide](http://ocdevel.com/mlg). Those episodes are a tutorial for this project, including an intro to Deep RL, hyperparameter decisions, etc. + +[GitHub Project](https://github.com/lefnire/tforce_btc_trader) + + + + +### TensorTrade: Trade Efficiently with Reinforcement Learning + + +TensorTrade is an open source Python framework for building, training, evaluating, and deploying robust trading algorithms using reinforcement learning. The framework focuses on being highly composable and extensible, to allow the system to scale from simple trading strategies on a single CPU, to complex investment strategies run on a distribution of HPC machines. + +Under the hood, the framework uses many of the APIs from existing machine learning libraries to maintain high quality data pipelines and learning models. One of the main goals of TensorTrade is to enable fast experimentation with algorithmic trading strategies, by leveraging the existing tools and pipelines provided by numpy, pandas, gym, keras, and tensorflow. + +[GitHub Project](https://github.com/notadamking/tensortrade) diff --git a/README.md b/README.md index fe515b587..b258e4145 100644 --- a/README.md +++ b/README.md @@ -1,239 +1,237 @@ -TensorForce: A TensorFlow library for applied reinforcement learning ==================================================================== +# Tensorforce: a TensorFlow library for applied reinforcement learning [![Docs](https://readthedocs.org/projects/tensorforce/badge)](http://tensorforce.readthedocs.io/en/latest/) -[![Gitter](https://badges.gitter.im/reinforceio/TensorForce.svg)](https://docs.google.com/forms/d/1_UD5Pb5LaPVUviD0pO0fFcEnx_vwenvuc00jmP2rRIc/) -[![Build Status](https://travis-ci.org/reinforceio/tensorforce.svg?branch=master)](https://travis-ci.org/reinforceio/tensorforce) -[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/reinforceio/tensorforce/blob/master/LICENSE) - -Introduction ------------- - -TensorForce is an open source reinforcement learning library focused on -providing clear APIs, readability and modularisation to deploy -reinforcement learning solutions both in research and practice.
-TensorForce is built on top of TensorFlow and compatible with Python 2.7 -and >3.5 and supports multiple state inputs and multi-dimensional -actions to be compatible with any type of simulation or application environment. - -TensorForce also aims to move all reinforcement learning logic into the -TensorFlow graph, including control flow. This both reduces dependencies -on the host language (Python), thus enabling portable computation graphs that -can be used in other languages and contexts, and improves performance. - -More information on architecture can also be found [on our blog](https://reinforce.io/blog/). -Please also read the [TensorForce FAQ](https://github.com/reinforceio/tensorforce/blob/master/FAQ.md) -if you encounter problems or have questions. - -Finally, read the latest update notes (UPDATE_NOTES.md) for an idea of -how the project is evolving, especially concerning majorAPI breaking updates. -We recently (20th February) merged a major branch which moves memories -and all remaining structures into TensorFlow variables. This causes a number -of breaking API change (see updated configurations, examples, and tests), but -improves performance. It further enables more expressive update semantics, -e.g. episode based instead of fixed time step based. - -The main difference to existing libraries is a strict separation of -environments, agents and update logic that facilitates usage in -non-simulation environments. Further, research code often relies on -fixed network architectures that have been used to tackle particular -benchmarks. TensorForce is built with the idea that (almost) everything -should be optionally configurable and in particular uses value function -template configurations to be able to quickly experiment with new -models. The goal of TensorForce is to provide a practitioner's -reinforcement learning framework that integrates into modern software -service architectures. - -TensorForce is actively being maintained and developed both to -continuously improve the existing code as well as to reflect new -developments as they arise. The aim is not to -include every new trick but to adopt methods as -they prove themselves stable. - -Features --------- - -TensorForce currently integrates with the OpenAI Gym API, OpenAI -Universe, DeepMind lab, ALE and Maze explorer. The following algorithms are available (all -policy methods both continuous/discrete and using a Beta distribution for bounded actions). - -- A3C using distributed TensorFlow or a multithreaded runner - now as part of our generic Model - usable with different agents. 
- [paper](https://arxiv.org/pdf/1602.01783.pdf) -- Trust Region Policy Optimization (TRPO) - ```trpo_agent``` - [paper](https://arxiv.org/abs/1502.05477) -- Normalised Advantage functions (NAFs) - ```naf_agent``` - [paper](https://arxiv.org/pdf/1603.00748.pdf) -- DQN - ```dqn_agent``` - [paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) -- Double-DQN - ```ddqn_agent``` - [paper](https://arxiv.org/abs/1509.06461) -- N-step DQN - ```dqn_nstep_agent``` -- Vanilla Policy Gradients (VPG/ REINFORCE) - ```vpg_agent```- [paper](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) -- Actor-critic models - via `baseline` for any policy gradient model (see next list) - [paper]() -- Deep Q-learning from Demonstration (DQFD) - - [paper](https://arxiv.org/abs/1704.03732) -- Proximal Policy Optimisation (PPO) - ```ppo_agent``` - [paper](https://arxiv.org/abs/1707.06347) -- Random and constant agents for sanity checking: ```random_agent```, ```constant_agent``` - -Other heuristics and their respective config key that can be turned on where sensible: - -- Generalized advantage estimation - ```gae_lambda``` - [paper](https://arxiv.org/abs/1506.02438) -- Prioritizied experience replay - memory type ```prioritized_replay``` - [paper](https://arxiv.org/abs/1511.05952) -- Bounded continuous actions are mapped to Beta distributions instead of Gaussians - [paper](http://proceedings.mlr.press/v70/chou17a/chou17a.pdf) -- Baseline / actor-critic modes: Based on raw states (```states```) or on network output (```network```). MLP (```mlp```), CNN (```cnn```) or custom network (```custom```). Special case for mode ```states```: baseline per state + linear combination layer (via ```baseline=dict(state1=..., state2=..., etc)```). -- Generic pure TensorFlow optimizers, most models can be used with natural gradient and evolutionary optimizers -- Preprocessing modes: ```normalize```, ```standardize```, ```grayscale```, ```sequence```, ```clip```, - ```divide```, ```image_resize``` -- Exploration modes: ```constant```,```linear_decay```, ```epsilon_anneal```, ```epsilon_decay```, - ```ornstein_uhlenbeck``` - -Installation ------------- - -We uploaded the latest stable version of TensorForce to PyPI. To install, just execute: +[![Gitter](https://badges.gitter.im/tensorforce/community.svg)](https://gitter.im/tensorforce/community) +[![Build Status](https://travis-ci.com/tensorforce/tensorforce.svg?branch=master)](https://travis-ci.com/tensorforce/tensorforce) +[![pypi version](https://img.shields.io/pypi/v/tensorforce)](https://pypi.org/project/Tensorforce/) +[![python version](https://img.shields.io/pypi/pyversions/tensorforce)](https://pypi.org/project/Tensorforce/) +[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/tensorforce/tensorforce/blob/master/LICENSE) +[![Donate](https://img.shields.io/badge/donate-GitHub_Sponsors-yellow)](https://github.com/sponsors/AlexKuhnle) +[![Donate](https://img.shields.io/badge/donate-Liberapay-yellow)](https://liberapay.com/TensorforceTeam/donate) + + +**This project is not maintained any longer!** + + +#### Introduction + +Tensorforce is an open-source deep reinforcement learning framework, with an emphasis on modularized flexible library design and straightforward usability for applications in research and practice. Tensorforce is built on top of [Google's TensorFlow framework](https://www.tensorflow.org/) and requires Python 3. 
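As a minimal usage sketch (illustrative only: the state/action shapes and hyperparameters below are made up, and the `Agent.create()` interface is the high-level one described in the documentation), an agent can be instantiated directly from state/action specifications, without tying it to any particular environment class:

```python
from tensorforce import Agent

# Hypothetical specification: an 8-dimensional float state and 4 discrete actions
agent = Agent.create(
    agent='ppo',
    states=dict(type='float', shape=(8,)),
    actions=dict(type='int', num_values=4),
    max_episode_timesteps=100,
    batch_size=10
)

agent.close()
```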
+ +Tensorforce follows a set of high-level design choices which differentiate it from other similar libraries: + +- **Modular component-based design**: Feature implementations, above all, strive to be as generally applicable and configurable as possible, potentially at some cost of faithfully resembling details of the introducing paper. +- **Separation of RL algorithm and application**: Algorithms are agnostic to the type and structure of inputs (states/observations) and outputs (actions/decisions), as well as the interaction with the application environment. +- **Full-on TensorFlow models**: The entire reinforcement learning logic, including control flow, is implemented in TensorFlow, to enable portable computation graphs independent of application programming language, and to facilitate the deployment of models. + + + +#### Quicklinks + +- [Documentation](http://tensorforce.readthedocs.io) and [update notes](https://github.com/tensorforce/tensorforce/blob/master/UPDATE_NOTES.md) +- [Contact](mailto:tensorforce.team@gmail.com) and [Gitter channel](https://gitter.im/tensorforce/community) +- [Benchmarks](https://github.com/tensorforce/tensorforce/blob/master/benchmarks) and [projects using Tensorforce](https://github.com/tensorforce/tensorforce/blob/master/PROJECTS.md) +- [Roadmap](https://github.com/tensorforce/tensorforce/blob/master/ROADMAP.md) and [contribution guidelines](https://github.com/tensorforce/tensorforce/blob/master/CONTRIBUTING.md) +- [GitHub Sponsors](https://github.com/sponsors/AlexKuhnle) and [Liberapay](https://liberapay.com/TensorforceTeam/donate) + + + +#### Table of content + +- [Installation](#installation) +- [Quickstart example code](#quickstart-example-code) +- [Command line usage](#command-line-usage) +- [Features](#features) +- [Environment adapters](#environment-adapters) +- [Support, feedback and donating](#support-feedback-and-donating) +- [Core team and contributors](#core-team-and-contributors) +- [Cite Tensorforce](#cite-tensorforce) + + + +## Installation + +A stable version of Tensorforce is periodically updated on PyPI and installed as follows: ```bash -pip install tensorforce +pip3 install tensorforce ``` -If you want to use the latest version from GitHub, use: - +To always use the latest version of Tensorforce, install the GitHub version instead: ```bash -git clone git@github.com:reinforceio/tensorforce.git -cd tensorforce -pip install -e . +git clone https://github.com/tensorforce/tensorforce.git +pip3 install -e tensorforce ``` -TensorForce is built on [Google's Tensorflow](https://www.tensorflow.org/). The installation command assumes -that you have `tensorflow` or `tensorflow-gpu` installed. +**Note on installation on M1 Macs:** At the moment Tensorflow, which is a core dependency of Tensorforce, cannot be installed on M1 Macs directly. Follow the ["M1 Macs" section](https://tensorforce.readthedocs.io/en/latest/basics/installation.html) in the documentation for a workaround. -Alternatively, you can use the following commands to install the tensorflow dependency. +Environments require additional packages for which there are setup options available (`ale`, `gym`, `retro`, `vizdoom`, `carla`; or `envs` for all environments), however, some require additional tools to be installed separately (see [environments documentation](http://tensorforce.readthedocs.io)). Other setup options include `tfa` for [TensorFlow Addons](https://www.tensorflow.org/addons) and `tune` for [HpBandSter](https://github.com/automl/HpBandSter) required for the `tune.py` script. 
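For instance (a sketch using the option names listed above; pick the extras that match your use case), the Gym adapter and the tuning dependencies can be installed together via pip extras:

```bash
# Tensorforce plus the OpenAI Gym adapter and HpBandSter for the tune.py script
pip3 install tensorforce[gym,tune]

# Or, for a local GitHub clone, all environment adapters at once
pip3 install -e 'tensorforce[envs]'
```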
-To install TensorForce with `tensorflow` (cpu), use: +**Note on GPU usage:** Different from (un)supervised deep learning, RL does not always benefit from running on a GPU, depending on environment and agent configuration. In particular for environments with low-dimensional state spaces (i.e., no images), it is hence worth trying to run on CPU only. -```bash -# PyPI install -pip install tensorforce[tf] -# Local install -pip install -e .[tf] -``` -To install TensorForce with `tensorflow-gpu` (gpu), use: +## Quickstart example code -```bash -# PyPI install -pip install tensorforce[tf_gpu] +```python +from tensorforce import Agent, Environment + +# Pre-defined or custom environment +environment = Environment.create( + environment='gym', level='CartPole', max_episode_timesteps=500 +) -# Local install -pip install -e .[tf_gpu] +# Instantiate a Tensorforce agent +agent = Agent.create( + agent='tensorforce', + environment=environment, # alternatively: states, actions, (max_episode_timesteps) + memory=10000, + update=dict(unit='timesteps', batch_size=64), + optimizer=dict(type='adam', learning_rate=3e-4), + policy=dict(network='auto'), + objective='policy_gradient', + reward_estimation=dict(horizon=20) +) + +# Train for 300 episodes +for _ in range(300): + + # Initialize episode + states = environment.reset() + terminal = False + + while not terminal: + # Episode timestep + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + +agent.close() +environment.close() ``` -To update TensorForce, use `pip install --upgrade tensorforce` for the PyPI -version, or run `git pull` in the tensorforce directory if you cloned the -GitHub repository. -Please note that we did not include OpenAI Gym/Universe/DeepMind lab in -the default install script because not everyone will want to use these. -Please install them as required, usually via pip. -Examples and documentation --------------------------- -For a quick start, you can run one of our example scripts using the -provided configurations, e.g. to run the TRPO agent on CartPole, execute -from the examples folder: +## Command line usage + +Tensorforce comes with a range of [example configurations](https://github.com/tensorforce/tensorforce/tree/master/benchmarks/configs) for different popular reinforcement learning environments. For instance, to run Tensorforce's implementation of the popular [Proximal Policy Optimization (PPO) algorithm](https://arxiv.org/abs/1707.06347) on the [OpenAI Gym CartPole environment](https://gym.openai.com/envs/CartPole-v1/), execute the following line: ```bash -python examples/openai_gym.py CartPole-v0 -a examples/configs/ppo.json -n examples/configs/mlp2_network.json +python3 run.py --agent benchmarks/configs/ppo.json --environment gym \ + --level CartPole-v1 --episodes 100 ``` -Documentation is available at -[ReadTheDocs](http://tensorforce.readthedocs.io). We also have tests -validating models on minimal environments which can be run from the main -directory by executing `pytest`{.sourceCode}. +For more information check out the [documentation](http://tensorforce.readthedocs.io). 
-Create and use agents ---------------------- -To use TensorForce as a library without using the pre-defined simulation -runners, simply install and import the library, then create an agent and -use it as seen below (see documentation for all optional parameters): -```python -from tensorforce.agents import PPOAgent - -# Create a Proximal Policy Optimization agent -agent = PPOAgent( - states_spec=dict(type='float', shape=(10,)), - actions_spec=dict(type='int', num_actions=10), - network_spec=[ - dict(type='dense', size=64), - dict(type='dense', size=64) - ], - batch_size=1000, - step_optimizer=dict( - type='adam', - learning_rate=1e-4 - ) -) +## Features -# Get new data from somewhere, e.g. a client to a web app -client = MyClient('http://127.0.0.1', 8080) +- **Network layers**: Fully-connected, 1- and 2-dimensional convolutions, embeddings, pooling, RNNs, dropout, normalization, and more; *plus* support of Keras layers. +- **Network architecture**: Support for multi-state inputs and layer (block) reuse, simple definition of directed acyclic graph structures via register/retrieve layer, plus support for arbitrary architectures. +- **Memory types**: Simple batch buffer memory, random replay memory. +- **Policy distributions**: Bernoulli distribution for boolean actions, categorical distribution for (finite) integer actions, Gaussian distribution for continuous actions, Beta distribution for range-constrained continuous actions, multi-action support. +- **Reward estimation**: Configuration options for estimation horizon, future reward discount, state/state-action/advantage estimation, and for whether to consider terminal and horizon states. +- **Training objectives**: (Deterministic) policy gradient, state-(action-)value approximation. +- **Optimization algorithms**: Various gradient-based optimizers provided by TensorFlow like Adam/AdaDelta/RMSProp/etc, evolutionary optimizer, natural-gradient-based optimizer, plus a range of meta-optimizers. +- **Exploration**: Randomized actions, sampling temperature, variable noise. +- **Preprocessing**: Clipping, deltafier, sequence, image processing. +- **Regularization**: L2 and entropy regularization. +- **Execution modes**: Parallelized execution of multiple environments based on Python's `multiprocessing` and `socket`. +- **Optimized act-only SavedModel extraction**. +- **TensorBoard support**. 
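The parallelized execution mode listed above is exposed through the `Runner` utility; the following is a hedged sketch (argument names such as `num_parallel` and `remote` follow the execution documentation, and the config path is the PPO benchmark file used elsewhere in this README):

```python
from tensorforce import Runner

# Train the PPO benchmark configuration on four CartPole copies in parallel,
# each environment running in its own process via multiprocessing-based remoting.
runner = Runner(
    agent='benchmarks/configs/ppo.json',
    environment=dict(environment='gym', level='CartPole-v1'),
    num_parallel=4, remote='multiprocessing'
)
runner.run(num_episodes=300)
runner.close()
```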
-# Poll new state from client -state = client.get_state() +By combining these modular components in different ways, a variety of popular deep reinforcement learning models/features can be replicated: -# Get prediction from agent, execute -action = agent.act(state) -reward = client.execute(action) +- Q-learning: [Deep Q-learning](https://www.nature.com/articles/nature14236), [Double-DQN](https://arxiv.org/abs/1509.06461), [Dueling DQN](https://arxiv.org/abs/1511.06581), [n-step DQN](https://arxiv.org/abs/1602.01783), [Normalised Advantage Function (NAF)](https://arxiv.org/abs/1603.00748) +- Policy gradient: [vanilla policy-gradient / REINFORCE](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf), [Actor-critic and A3C](https://arxiv.org/abs/1602.01783), [Proximal Policy Optimization](https://arxiv.org/abs/1707.06347), [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477), [Deterministic Policy Gradient](https://arxiv.org/abs/1509.02971) -# Add experience, agent automatically updates model according to batch size -agent.observe(reward=reward, terminal=False) -``` +Note that in general the replication is not 100% faithful, since the models as described in the corresponding paper often involve additional minor tweaks and modifications which are hard to support with a modular design (and, arguably, also questionable whether it is important/desirable to support them). On the upside, these models are just a few examples from the multitude of module combinations supported by Tensorforce. + + + +## Environment adapters + +- [Arcade Learning Environment](https://github.com/mgbellemare/Arcade-Learning-Environment), a simple object-oriented framework that allows researchers and hobbyists to develop AI agents for Atari 2600 games. +- [CARLA](https://github.com/carla-simulator/carla), is an open-source simulator for autonomous driving research. +- [OpenAI Gym](https://gym.openai.com/), a toolkit for developing and comparing reinforcement learning algorithms which supports teaching agents everything from walking to playing games like Pong or Pinball. +- [OpenAI Retro](https://github.com/openai/retro), lets you turn classic video games into Gym environments for reinforcement learning and comes with integrations for ~1000 games. +- [OpenSim](http://osim-rl.stanford.edu/), reinforcement learning with musculoskeletal models. +- [PyGame Learning Environment](https://github.com/ntasfi/PyGame-Learning-Environment/), learning environment which allows a quick start to Reinforcement Learning in Python. +- [ViZDoom](https://github.com/mwydmuch/ViZDoom), allows developing AI bots that play Doom using only the visual information. -Benchmarks ----------- -We provide a seperate repository for benchmarking our algorithm implementations at -[reinforceio/tensorforce-benchmark](https://github.com/reinforceio/tensorforce-benchmark). +## Support, feedback and donating -Docker containers for benchmarking (CPU and GPU) are available. +Please get in touch via [mail](mailto:tensorforce.team@gmail.com) or on [Gitter](https://gitter.im/tensorforce/community) if you have questions, feedback, ideas for features/collaboration, or if you seek support for applying Tensorforce to your problem. -This is a sample output for `CartPole-v0`, comparing VPG, TRPO and PPO: +If you want to support the Tensorforce core team (see below), please also consider donating: [GitHub Sponsors](https://github.com/sponsors/AlexKuhnle) or [Liberapay](https://liberapay.com/TensorforceTeam/donate). 
-![example output](https://user-images.githubusercontent.com/14904111/29328011-52778284-81f1-11e7-8f70-6554ca9388ed.png) -Please refer to the [tensorforce-benchmark](https://github.com/reinforceio/tensorforce-benchmark) repository -for more information. +## Core team and contributors -Community and contributions ---------------------------- +Tensorforce is currently developed and maintained by [Alexander Kuhnle](https://github.com/AlexKuhnle). -TensorForce is developed by [reinforce.io](https://reinforce.io), a new -project focused on providing reinforcement learning software -infrastructure. For any questions, get in touch at -. +Earlier versions of Tensorforce (<= 0.4.2) were developed by [Michael Schaarschmidt](https://github.com/michaelschaarschmidt), [Alexander Kuhnle](https://github.com/AlexKuhnle) and [Kai Fricke](https://github.com/krfricke). -Please file bug reports and feature discussions as GitHub issues in first instance. +The advanced parallel execution functionality was originally contributed by Jean Rabault (@jerabaul29) and Vincent Belus (@vbelus). Moreover, the pretraining feature was largely developed in collaboration with Hongwei Tang (@thw1021) and Jean Rabault (@jerabaul29). -There is also a developer chat you are welcome to join. For joining, we ask to provide -some basic details how you are using TensorForce so we can learn more about applications and our -community. Please fill in [this short form](https://docs.google.com/forms/d/1_UD5Pb5LaPVUviD0pO0fFcEnx_vwenvuc00jmP2rRIc/) which will take - you to the chat after. +The CARLA environment wrapper is currently developed by Luca Anzalone (@luca96). -Cite ----- +We are very grateful for our open-source contributors (listed according to Github, updated periodically): -If you use TensorForce in your academic research, we would be grateful if you could cite it as follows: +Islandman93, sven1977, Mazecreator, wassname, lefnire, daggertye, trickmeyer, mkempers, +mryellow, ImpulseAdventure, +janislavjankov, andrewekhalel, +HassamSheikh, skervim, +beflix, coord-e, +benelot, tms1337, vwxyzjn, erniejunior, +Deathn0t, petrbel, nrhodes, batu, yellowbee686, tgianko, +AdamStelmaszczyk, BorisSchaeling, christianhidber, Davidnet, ekerazha, gitter-badger, kborozdin, Kismuz, mannsi, milesmcc, nagachika, neitzal, ngoodger, perara, sohakes, tomhennigan. + + + +## Cite Tensorforce + +Please cite the framework as follows: ``` -@misc{schaarschmidt2017tensorforce, - author = {Schaarschmidt, Michael and Kuhnle, Alexander and Fricke, Kai}, - title = {TensorForce: A TensorFlow library for applied reinforcement learning}, - howpublished={Web page}, - url = {https://github.com/reinforceio/tensorforce}, - year = {2017} +@misc{tensorforce, + author = {Kuhnle, Alexander and Schaarschmidt, Michael and Fricke, Kai}, + title = {Tensorforce: a TensorFlow library for applied reinforcement learning}, + howpublished = {Web page}, + url = {https://github.com/tensorforce/tensorforce}, + year = {2017} } ``` -We are also very grateful for our open source contributors (listed according to github): Islandman93, wassname, -Mazecreator, lefnire, sven1977, trickmeyer, mryellow, ImpulseAdventure, vwxyzjn, beflix, tms1337, BorisSchaeling, ngoodger, -ekerazha, Davidnet, nikoliazekter, AdamStelmaszczyk, 10nagachika, petrbel, Kismuz. 
+If you use the [parallel execution functionality](https://github.com/tensorforce/tensorforce/tree/master/tensorforce/contrib), please additionally cite it as follows: + +``` +@article{rabault2019accelerating, + title = {Accelerating deep reinforcement learning strategies of flow control through a multi-environment approach}, + author = {Rabault, Jean and Kuhnle, Alexander}, + journal = {Physics of Fluids}, + volume = {31}, + number = {9}, + pages = {094105}, + year = {2019}, + publisher = {AIP Publishing} +} +``` + +If you use Tensorforce in your research, you may additionally consider citing the following paper: + +``` +@article{lift-tensorforce, + author = {Schaarschmidt, Michael and Kuhnle, Alexander and Ellis, Ben and Fricke, Kai and Gessert, Felix and Yoneki, Eiko}, + title = {{LIFT}: Reinforcement Learning in Computer Systems by Learning From Demonstrations}, + journal = {CoRR}, + volume = {abs/1808.07903}, + year = {2018}, + url = {http://arxiv.org/abs/1808.07903}, + archivePrefix = {arXiv}, + eprint = {1808.07903} +} +``` diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 000000000..d047b3bd1 --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,27 @@ +# Roadmap + +1. RNN policies + - Extend configurability + - Allow recurrent baselines + - More RNN modules, incl Transformer +2. Reward estimation extensions + - Auxiliary losses + - Curiosity + - Imitation learning + - Distributional perspective +3. State/action modeling + - Sequence states/actions + - State-dependent actions + - Conditional/hierarchical actions +4. Memory architecture + - Optimize retrieval of sequences + - Use TensorArray + - Improve other limitations +5. CARLA environment + - Docs and assertions + - World's map loading (e.g. random, specific, etc.) + - Weather support + - Pretraining and Free play (e.g. for data collection) + - State space with a temporal component. + - ... +6. To be determined... diff --git a/UPDATE_NOTES.md b/UPDATE_NOTES.md index d601e5d04..51ccab3dc 100644 --- a/UPDATE_NOTES.md +++ b/UPDATE_NOTES.md @@ -1,181 +1,373 @@ -Update notes ------------- +# Update notes -This file tracks all major updates and new features. As TensorForce is still in alpha, -we are continuously implementing small updates and bug fixes, which will not -be tracked here in detail but through github issues. +This file records all major updates and new features, starting from version 0.5. As Tensorforce is still developing, updates and bug fixes for the internal architecture are continuously being implemented, which will not be tracked here in detail. -20th February -- Merge of memory branch with major library-internal changes: core.memories module, memory_model, - adapted handling of batching/losses/optimization, etc. -- Updated and standardized agent parameters, see documentation of agent classes. -14th January +### Latest changes -- Reverted back deprecated API call to be compatible with 1.4.1 in version 0.3.6.1 -12th January +--- -- Implemented some hot-fixes following changes in TensorFlow regarding variable registration. - These changes (first observed in 1.4) caused our custom getters for tf.make_template to register - variables differently, thus sometimes causing double registration in our variable lists. - The latest pip version 0.3.5 combined with TensorFlow 1.5.0rc0 address these issues. -6th January +### Version 0.6.5 -- In December, a number of bugs regarding exploration and a numberical issue in generalised - advantage estimation were fixed which seem to increase performance so an update is recommended. 
-- Agent structure saw major refactoring to remove redundant code, introduced a ```LearningAgent``` - to hold common fields and distinguish from non-learning agents (e.g. ```RandomAgent```) -- We are preparing to move memories into the TensorFlow graph which will fix sequences and allow subsampling - in the optimizers. Further, new episode/batch semantics will be enabled (e.g. episode based instead of - timestep based batching). +##### Agents: +- Renamed agent argument `reward_preprocessing` to `reward_processing`, and in case of Tensorforce agent moved to `reward_estimation[reward_processing]` -9th December 2017 +##### Distributions: +- New `categorical` distribution argument `skip_linear` to not add the implicit linear logits layer -- Renamed LSTM to InternalLSTM and created a new LSTM layer which implements more standard - sequence functionality. The ```internal_lstm``` is used for internal agent state, while - ```lstm``` may be used for seq2seq problems. +##### Environments: +- Support for multi-actor parallel environments via new function `Environment.num_actors()` + - `Runner` uses multi-actor parallelism by default if environment is multi-actor +- New optional `Environment` function `episode_return()` which returns the true return of the last episode, if cumulative sum of environment rewards is not a good metric for runner display -2nd December 2017 +##### Examples: +- New `vectorized_environment.py` and `multiactor_environment.py` script to illustrate how to setup a vectorized/multi-actor environment. -- Sequence preprocessor temporarily broken; use version 0.3.2 if required. This is because sequence sampling - in TensorFlow is only sensibly possible once replay memories/batches have also been moved into TensorFlow. -- Moved pre-processing and exploration from agent (in Python logic) to TensorFlow control flow in model -11th November 2017 +--- -- BREAKING: We removed the Configuration object. Most users feel named arguments are far more - comfortable to handle. Agents are now created specifying all non-default paremeters explicitly, - see quickstart examples. -- Agents are now specified as part of the configuration via a 'type', e.g. "type" : "dqn_agent" -8th November 2017 +### Version 0.6.4 -- Layers/networks/etc now take an additional argument `update` in `tf_apply`, a boolean tensor indicating whether the call happens during an update. +##### Agents: +- Agent argument `update_frequency` / `update[frequency]` now supports float values > 0.0, which specify the update-frequency relative to the batch-size +- Changed default value for argument `update_frequency` from `1.0` to `0.25` for DQN, DoubleDQN, DuelingDQN agents +- New argument `return_processing` and `advantage_processing` (where applicable) for all agent sub-types +- New function `Agent.get_specification()` which returns the agent specification as dictionary +- New function `Agent.get_architecture()` which returns a string representation of the network layer architecture -7th November 2017 +##### Modules: +- Improved and simplified module specification, for instance: `network=my_module` instead of `network=my_module.TestNetwork`, or `environment=envs.custom_env` instead of `environment=envs.custom_env.CustomEnvironment` (module file needs to be in the same directory or a sub-directory) -- New saver/summary/distributed config interface via entries `saver_spec`, `summary_spec`, `distributed_spec`. -- The first two require at least a `directory` value. 
-- Automatically periodically saves model/summaries with `seconds` in respective `_spec` set. +##### Networks: +- New argument `single_output=True` for some policy types which, if `False`, allows the specification of additional network outputs for some/all actions via registered tensors +- `KerasNetwork` argument `model` now supports arbitrary functions as long as they return a `tf.keras.Model` -22nd October 2017 +##### Layers: +- New layer type `SelfAttention` (specification key: `self_attention`) -- BREAKING: We released a complete redesign including our new optimization module. Optimizers - which previously were only available in Python (natural gradients) are now available in pure - TensorFlow. A blogpost on this will appear soon. -- Agent configurations are now decomposed in (```action_spec```, ```states_spec```,```network_spec```, and config). - This facilitates a more clear separation between hyperparameters of the model and describing the problem. -- Models are now heavily making use of templated graph construction. -- Policy gradient models have been decomposed in models using likelihood ratios and log - likelihood (```pg_prob_ratio_model```) and (```pg_log_prob_model```) -- Q-models are now implemented as distributional models, which enables the use of natural - gradients in Q-models. A blogpost on the practical implications is also on the way. -- Baselines: It is now possible to share parameters between main networks and baselines via - the baseline option (```NetworkBaseline```). -- Actions now support boolean types. +##### Parameters: +- Support tracking of non-constant parameter values -2nd September 2017 +##### Runner: +- Rename attribute `episode_rewards` as `episode_returns`, and TQDM status `reward` as `return` +- Extend argument `agent` to support `Agent.load()` keyword arguments to load an existing agent instead of creating a new one. -- Added multi-LSTM support -- Fixed various bugs around reporting and logging -- Introduced CNN baseline -- Added baseline support for multiple states (experimental). Every state gets its own baseline - and predictions are averaged +##### Examples: +- Added `action_masking.py` example script to illustrate an environment implementation with built-in action masking. 
-13th August 2017 +##### Bugfixes: +- Customized device placement was not applied to most tensors -- Fixed PPO performance issues, which we now recommend as the default -- Implemented Beta distribution for bounded actions -- Added n-step DQN and multithreaded runner -- Fixed wrong internal calculation of `prob_ratio` and `kl_divergence` in TRPO/PPO -- Added `next_internals` functionality to memories and QModel -- Changed config value names related to advantage estimation to `gae_rewards` and `normalize_rewards` +--- -3rd August 2017 -- Added `ls_accept_ratio=0.01` and adapted names of other TRPO config parameters related to line search -- Various bugs in Categorical DQN and Q-model target network scope fixed by @Islandman93 -- Refactored distributions, categorical now using Gumbel-softmax +### Version 0.6.3 -29th July 2017 +##### Agents: +- New agent argument `tracking` and corresponding function `tracked_tensors()` to track and retrieve the current value of predefined tensors, similar to `summarizer` for TensorBoard summaries +- New experimental values `trace_decay` and `gae_decay` for Tensorforce agent argument `reward_estimation`, soon for other agent types as well +- New options `"early"` and `"late"` for value `estimate_advantage` of Tensorforce agent argument `reward_estimation` +- Changed default value for `Agent.act()` argument `deterministic` from `False` to `True` -- Added `QModel` as base class for DQN (hence DQFD) and NAF -- Added `next_state` placeholder to `QModel`, and boolean flag to `Memory.get_batch` to include next states -- `Configuration` now keeps track of which values were accessed, and `Agent` reports warning if not all were accessed +##### Networks: +- New network type `KerasNetwork` (specification key: `keras`) as wrapper for networks specified as Keras model +- Passing a Keras model class/object as policy/network argument is automatically interpreted as `KerasNetwork` +##### Distributions: +- Changed `Gaussian` distribution argument `global_stddev=False` to `stddev_mode='predicted'` +- New `Categorical` distribution argument `temperature_mode=None` -28th July 2017 +##### Layers: +- New option for `Function` layer argument `function` to pass string function expression with argument "x", e.g. "(x+1.0)/2.0"
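+
+For illustration, a network specification using this string-expression form might look as follows (a hedged sketch; it assumes `function` is the specification key of the `Function` layer):
+
+```python
+network = [
+    dict(type='dense', size=64, activation='tanh'),
+    # String expression with argument "x": rescale tanh activations from [-1, 1] to [0, 1]
+    dict(type='function', function='(x + 1.0) / 2.0'),
+    dict(type='dense', size=64)
+]
+```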
-- Moved external environments to tensorforce/contrib. The environment module just contains the base environment class and our test environment going forward -- Merged environments ALE and Maze explorer, thanks to Islandman93 and mryellow +##### Summarizer: +- New summary `episode-length` recorded as part of summary label "reward" +##### Environments: +- Support for vectorized parallel environments via new function `Environment.is_vectorizable()` and new argument `num_parallel` for `Environment.reset()` + - See `tensorforce/environments.cartpole.py` for a vectorizable environment example + - `Runner` uses vectorized parallelism by default if `num_parallel > 1`, `remote=None` and environment supports vectorization + - See `examples/act_observe_vectorized.py` for more details on act-observe interaction +- New extended and vectorizable custom CartPole environment via key `custom_cartpole` (work in progress) +- New environment argument `reward_shaping` to provide a simple way to modify/shape rewards of an environment, can be specified either as callable or string function expression -25th July 2017 +##### run.py script: +- New option for command line arguments `--checkpoints` and `--summaries` to add comma-separated checkpoint/summary filename in addition to directory +- Added episode lengths to logging plot besides episode returns -- New optional argument `shape` for action specification, if an array of actions sharing the same specification is required -- Complete and correct mapping of OpenAIGym state/action spaces to corresponding TensorForce state/action specifications -- `MinimalTest` environment extension for multiple actions, plus an additional multi-state/action test for each agent +##### Bugfixes: +- Temporal horizon handling of RNN layers +- Critical bugfix for late horizon value prediction (including DQN variants and DPG agent) in combination with baseline RNN +- GPU problems with scatter operations -23th July 2017 +--- -- Implemented prototype of Proximal Policy Optimisation (PPO) -- Configuration argument network can now take module paths, not just functions -- Fixed prioritized experience replay sampling bug -- Enabling default values for distributions, see https://github.com/reinforceio/tensorforce/issues/34 +### Version 0.6.2 -8th July 2017 +##### Bugfixes: +- Critical bugfix for DQN variants and DPG agent -- BREAKING CHANGE: We modified the act and observe API once more because we think there was -a lack of clarity with regard to which state is observed (current vs next). The agent now internally -manages states and actions in the correct sequence so observe only needs reward and terminal. -We further introduced a method ```import_observations``` so memory-based agents can preload -data into memory (e.g. if historic data is available). We also added a method ```last_observation``` -on the generic agent which gives the current state, action, reward, terminal and internal state -Fixed distributed agent mode, should run as intended now -Fixed target network usage in NAF.
Tests now run smoothl -- DQFDAgent now inherits from MemoryAgent +--- -2nd July 2017 -- Fixed lab integration: updated bazel BUILD file with command line options -- Adjusted environment integration to correctly select state and action interfaces -- Changed default agent to VPG since lab mixes continuous and discrete actions +### Version 0.6.1 +##### Agents: +- Removed default value `"adam"` for Tensorforce agent argument `optimizer` (since default optimizer argument `learning_rate` removed, see below) +- Removed option `"minimum"` for Tensorforce agent argument `memory`, use `None` instead +- Changed default value for `dqn`/`double_dqn`/`dueling_dqn` agent argument `huber_loss` from `0.0` to `None` -25h June 2017 +##### Layers: +- Removed default value `0.999` for `exponential_normalization` layer argument `decay` +- Added new layer `batch_normalization` (generally should only be used for the agent arguments `reward_processing[return_processing]` and `reward_processing[advantage_processing]`) +- Added `exponential/instance_normalization` layer argument `only_mean` with default `False` +- Added `exponential/instance_normalization` layer argument `min_variance` with default `1e-4` -- Added prioritised experience replay -- Added RandomAgent for discrete/continuous random baselines -- Moved pre-processing from runner to agent, analogue to exploration +##### Optimizers: +- Removed default value `1e-3` for optimizer argument `learning_rate` +- Changed default value for optimizer argument `gradient_norm_clipping` from `1.0` to `None` (no gradient clipping) +- Added new optimizer `doublecheck_step` and corresponding argument `doublecheck_update` for optimizer wrapper +- Removed `linesearch_step` optimizer argument `accept_ratio` +- Removed `natural_gradient` optimizer argument `return_improvement_estimate` +##### Saver: +- Added option to specify agent argument `saver` as string, which is interpreted as `saver[directory]` with otherwise default values +- Added default value for agent argument `saver[frequency]` as `10` (save model every 10 updates by default) +- Changed default value of agent argument `saver[max_checkpoints]` from `5` to `10` -11th June 2017 +##### Summarizer: +- Added option to specify agent argument `summarizer` as string, which is interpreted as `summarizer[directory]` with otherwise default values +- Renamed option of agent argument `summarizer` from `summarizer[labels]` to `summarizer[summaries]` (use of the term "label" due to earlier version, outdated and confusing by now) +- Changed interpretation of agent argument `summarizer[summaries] = "all"` to include only numerical summaries, so all summaries except "graph" +- Changed default value of agent argument `summarizer[summaries]` from `["graph"]` to `"all"` +- Changed default value of agent argument `summarizer[max_summaries]` from `5` to `7` (number of different colors in TensorBoard) +- Added option `summarizer[filename]` to agent argument `summarizer` -- Fixed bug in DQFD test where demo data was not always the - correct action. Also fixed small bug in DQFD loss (mean over - supervised loss) -- Network entry added to configuration so no separate network builder - has to be passed to the agent constructor (see example) -- The async mode using distributed tensorflow has been merged into the - main model class. See the openai\_gym\_async.py example. In - particular, this means multiple agents are now available in - async mode. N.b. we are still working on making async/distributed - things more convenient to use. 
-- Fixed bug in NAF where target value (V) was connected to - training output. Also added gradient clipping to NAF because we - observed occasional numerical instability in testing. -- For the same reason, we have altered the tests to always run - multiple times and allow for an occasional failure on travis so our - builds don't get broken by a random initialisation leading to - an under/overflow. -- Updated OpenAI Universe integration to work with our state/action - interface, see an example in examples/openai\_universe.py -- Added convenience method to create Network directly from json - without needing to create a network builder, see examples for usage +##### Recorder: +- Added option to specify agent argument `recorder` as string, which is interpreted as `recorder[directory]` with otherwise default values + +##### run.py script: +- Added `--checkpoints`/`--summaries`/`--recordings` command line arguments to enable saver/summarizer/recorder agent argument specification separate from core agent configuration + +##### Examples: +- Added `save_load_agent.py` example script to illustrate regular agent saving and loading + +##### Bugfixes: +- Fixed problem with optimizer argument `gradient_norm_clipping` not being applied correctly +- Fixed problem with `exponential_normalization` layer not updating moving mean and variance correctly +- Fixed problem with `recent` memory for timestep-based updates sometimes sampling invalid memory indices + + +--- + + +### Version 0.6 + +- Removed agent arguments `execution`, `buffer_observe`, `seed` +- Renamed agent arguments `baseline_policy`/`baseline_network`/`critic_network` to `baseline`/`critic` +- Renamed agent `reward_estimation` arguments `estimate_horizon` to `predict_horizon_values`, `estimate_actions` to `predict_action_values`, `estimate_terminal` to `predict_terminal_values` +- Renamed agent argument `preprocessing` to `state_preprocessing` +- Default agent preprocessing `linear_normalization` +- Moved agent arguments for reward/return/advantage processing from `preprocessing` to `reward_preprocessing` and `reward_estimation[return_/advantage_processing]` +- New agent argument `config` with values `buffer_observe`, `enable_int_action_masking`, `seed` +- Renamed PPO/TRPO/DPG argument `critic_network`/`_optimizer` to `baseline`/`baseline_optimizer` +- Renamed PPO argument `optimization_steps` to `multi_step` +- New TRPO argument `subsampling_fraction` +- Changed agent argument `use_beta_distribution` default to false +- Added double DQN agent (`double_dqn`) +- Removed `Agent.act()` argument `evaluation` +- Removed agent function arguments `query` (functionality removed) +- Agent saver functionality changed (Checkpoint/SavedModel instead of Saver/Protobuf): `save`/`load` functions and `saver` argument changed +- Default behavior when specifying `saver` is not to load agent, unless agent is created via `Agent.load` +- Agent summarizer functionality changed: `summarizer` argument changed, some summary labels and other options removed +- Renamed RNN layers `internal_{rnn/lstm/gru}` to `rnn/lstm/gru` and `rnn/lstm/gru` to `input_{rnn/lstm/gru}` +- Renamed `auto` network argument `internal_rnn` to `rnn` +- Renamed `(internal_)rnn/lstm/gru` layer argument `length` to `horizon` +- Renamed `update_modifier_wrapper` to `optimizer_wrapper` +- Renamed `optimizing_step` to `linesearch_step`, and `UpdateModifierWrapper` argument `optimizing_iterations` to `linesearch_iterations` +- Optimizer `subsampling_step` accepts both absolute (int) and relative (float)
fractions +- Objective `policy_gradient` argument `ratio_based` renamed to `importance_sampling` +- Added objectives `state_value` and `action_value` +- Added `Gaussian` distribution arguments `global_stddev` and `bounded_transform` (for improved bounded action space handling) +- Changed default memory `device` argument to `CPU:0` +- Renamed rewards summaries +- `Agent.create()` accepts act-function as `agent` argument for recording +- Singleton states and actions are now consistently handled as singletons +- Major change to policy handling and defaults, in particular `parametrized_distributions`, new default policies `parametrized_state/action_value` +- Combined `long` and `int` type +- Always wrap environment in `EnvironmentWrapper` class +- Changed `tune.py` arguments + + +--- + + +### Version 0.5.5 + +- Changed independent mode of `agent.act` to use final values of dynamic hyperparameters and avoid TensorFlow conditions +- Extended `"tensorflow"` format of `agent.save` to include an optimized Protobuf model with an act-only graph as `.pb` file, and `Agent.load` format `"pb-actonly"` to load act-only agent based on Protobuf model +- Support for custom summaries via new `summarizer` argument value `custom` to specify summary type, and `Agent.summarize(...)` to record summary values +- Added min/max-bounds for dynamic hyperparameters to assert valid range and infer other arguments +- Argument `batch_size` now mandatory for all agent classes +- Removed `Estimator` argument `capacity`, now always automatically inferred +- Internal changes related to agent arguments `memory`, `update` and `reward_estimation` +- Changed the default `bias` and `activation` argument of some layers +- Fixed issues with `sequence` preprocessor +- DQN and dueling DQN properly constrained to `int` actions only +- Added `use_beta_distribution` argument with default `True` to many agents and `ParametrizedDistributions` policy, so default can be changed + + +--- + + +### Version 0.5.4 + +- DQN/DuelingDQN/DPG argument `memory` now required to be specified explicitly, plus `update_frequency` default changed +- Removed (temporarily) `conv1d/conv2d_transpose` layers due to TensorFlow gradient problems +- `Agent`, `Environment` and `Runner` can now be imported via `from tensorforce import ...` +- New generic reshape layer available as `reshape` +- Support for batched version of `Agent.act` and `Agent.observe` +- Support for parallelized remote environments based on Python's `multiprocessing` and `socket` (replacing `tensorforce/contrib/socket_remote_env/` and `tensorforce/environments/environment_process_wrapper.py`), available via `Environment.create(...)`, `Runner(...)` and `run.py` +- Removed `ParallelRunner` and merged functionality with `Runner` +- Changed `run.py` arguments +- Changed independent mode for `Agent.act`: additional argument `internals` and corresponding return value, initial internals via `Agent.initial_internals()`, `Agent.reset()` not required anymore +- Removed `deterministic` argument for `Agent.act` unless independent mode +- Added `format` argument to `save`/`load`/`restore` with supported formats `tensorflow`, `numpy` and `hdf5` +- Changed `save` argument `append_timestep` to `append` with default `None` (instead of `'timesteps'`) +- Added `get_variable` and `assign_variable` agent functions + + +--- + + +### Version 0.5.3 + +- Added optional `memory` argument to various agents +- Improved summary labels, particularly `"entropy"` and `"kl-divergence"` +- `linear` layer now accepts
tensors of rank 1 to 3 +- Network output / distribution input does not need to be a vector anymore +- Transposed convolution layers (`conv1d/2d_transpose`) +- Parallel execution functionality contributed by @jerabaul29, currently under `tensorforce/contrib/` +- Accept string for runner `save_best_agent` argument to specify best model directory different from `saver` configuration +- `saver` argument `steps` removed and `seconds` renamed to `frequency` +- Moved `Parallel/Runner` argument `max_episode_timesteps` from `run(...)` to constructor +- New `Environment.create(...)` argument `max_episode_timesteps` +- TensorFlow 2.0 support +- Improved Tensorboard summaries recording +- Summary labels `graph`, `variables` and `variables-histogram` temporarily not working +- TF-optimizers updated to TensorFlow 2.0 Keras optimizers +- Added TensorFlow Addons dependency, and support for TFA optimizers +- Changed unit of `target_sync_frequency` from timesteps to updates for `dqn` and `dueling_dqn` agent + + +--- + + +### Version 0.5.2 + +- Improved unittest performance +- Added `updates` and renamed `timesteps`/`episodes` counter for agents and runners +- Renamed `critic_{network,optimizer}` argument to `baseline_{network,optimizer}` +- Added Actor-Critic (`ac`), Advantage Actor-Critic (`a2c`) and Dueling DQN (`dueling_dqn`) agents +- Improved "same" baseline optimizer mode and added optional weight specification +- Reuse layer now global for parameter sharing across modules +- New block layer type (`block`) for easier sharing of layer blocks +- Renamed `PolicyAgent/-Model` to `TensorforceAgent/-Model` +- New `Agent.load(...)` function, saving includes agent specification +- Removed `PolicyAgent` argument `(baseline-)network` +- Added policy argument `temperature` +- Removed `"same"` and `"equal"` options for `baseline_*` arguments and changed internal baseline handling +- Combined `state/action_value` to `value` objective with argument `value` either `"state"` or `"action"` + + +--- + + +### Version 0.5.1 + +- Fixed setup.py packages value + + +--- + + +### Version 0.5.0 + +##### Agent: + +- DQFDAgent removed (temporarily) +- DQNNstepAgent and NAFAgent part of DQNAgent +- Agents need to be initialized via `agent.initialize()` before application +- States/actions of type `int` require an entry `num_values` (instead of `num_actions`) +- `Agent.from_spec()` changed and renamed to `Agent.create()` +- `Agent.act()` argument `fetch_tensors` changed and renamed to `query`, `index` renamed to `parallel`, `buffered` removed +- `Agent.observe()` argument `index` renamed to `parallel` +- `Agent.atomic_observe()` removed +- `Agent.save/restore_model()` renamed to `Agent.save/restore()` + +##### Agent arguments: + +- `update_mode` renamed to `update` +- `states_preprocessing` and `reward_preprocessing` changed and combined to `preprocessing` +- `actions_exploration` changed and renamed to `exploration` +- `execution` entry `num_parallel` replaced by a separate argument `parallel_interactions` +- `batched_observe` and `batching_capacity` replaced by argument `buffer_observe` +- `scope` renamed to `name` + +##### DQNAgent arguments: + +- `update_mode` replaced by `batch_size`, `update_frequency` and `start_updating` +- `optimizer` removed, implicitly defined as `'adam'`, `learning_rate` added +- `memory` defines capacity of implicitly defined memory `'replay'` +- `double_q_model` removed (temporarily) + +##### Policy gradient agent arguments: + +- New mandatory argument `max_episode_timesteps` +- `update_mode` 
replaced by `batch_size` and `update_frequency` +- `memory` removed +- `baseline_mode` removed +- `baseline` argument changed and renamed to `critic_network` +- `baseline_optimizer` renamed to `critic_optimizer` +- `gae_lambda` removed (temporarily) + +##### PPOAgent arguments: + +- `step_optimizer` removed, implicitly defined as `'adam'`, `learning_rate` added + +##### TRPOAgent arguments: + +- `cg_*` and `ls_*` arguments removed + +##### VPGAgent arguments: + +- `optimizer` removed, implicitly defined as `'adam'`, `learning_rate` added + +##### Environment: + +- Environment properties `states` and `actions` are now functions `states()` and `actions()` +- States/actions of type `int` require an entry `num_values` (instead of `num_actions`) +- New function `Environment.max_episode_timesteps()` + +##### Contrib environments: + +- ALE, MazeExp, OpenSim, Gym, Retro, PyGame and ViZDoom moved to `tensorforce.environments` +- Other environment implementations removed (may be upgraded in the future) + +##### Runners: + +- Improved `run()` API for `Runner` and `ParallelRunner` +- `ThreadedRunner` removed + +##### Other: + +- `examples` folder (including `configs`) removed, apart from `quickstart.py` +- New `benchmarks` folder to replace parts of old `examples` folder diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..3bb6c9089 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,14 @@ +# Tensorforce Benchmarks + +Use the `run.py` script with the following arguments to produce benchmarks: + +```bash +python run.py --agent benchmarks/configs/ppo.json --environment gym --level CartPole-v1 \ + --episodes 100 --repeat 10 --path benchmarks/gym-cartpole/ppo +``` + +To run a full benchmark of a config in the `configs` subfolder, call the `benchmark.sh` bash script with the config name: + +```bash +benchmarks/benchmark.sh ppo +``` diff --git a/benchmarks/benchmark.sh b/benchmarks/benchmark.sh new file mode 100644 index 000000000..48a3c35d7 --- /dev/null +++ b/benchmarks/benchmark.sh @@ -0,0 +1,25 @@ +if [ -z ${2+x} ]; then + + echo "=== Benchmarking $1 ===" + + echo "OpenAI Gym: classic control" + # echo "> Acrobot-v1" + # python run.py benchmarks/configs/$1.json gym -l Acrobot-v1 -e 300 -r 10 -p benchmarks/gym-acrobot/$1 + echo "> CartPole-v1" + python run.py --agent benchmarks/configs/$1.json --environment benchmarks/configs/cartpole.json --episodes 100 --repeat 10 --path benchmarks/gym-cartpole/$1 + # echo "> MountainCar-v0" + # python run.py benchmarks/configs/$1.json gym -l MountainCar-v0 -e 300 -r 10 -p benchmarks/gym-mountaincar/$1 + # echo "> MountainCarContinuous-v0" + # python run.py benchmarks/configs/$1.json gym -l MountainCarContinuous-v0 -e 300 -r 10 -p benchmarks/gym-mountaincar-continuous/$1 + # echo "> Pendulum-v0" + # python run.py benchmarks/configs/$1.json gym -l Pendulum-v0 -e 300 -r 10 -p benchmarks/gym-pendulum/$1 + + # echo "OpenAI Gym: Box2D" + # echo "> LunarLander-v2" + # python run.py benchmarks/configs/$1.json gym -l LunarLander-v2 -e 300 -r 10 -p benchmarks/gym-lunarlander/$1 + +else + + python run.py benchmarks/configs/$1.json benchmarks/configs/$2.json -e 300 -r 10 --path benchmarks/$2/$1 + +fi diff --git a/benchmarks/configs/cartpole.json b/benchmarks/configs/cartpole.json new file mode 100644 index 000000000..e77c4c719 --- /dev/null +++ b/benchmarks/configs/cartpole.json @@ -0,0 +1,7 @@ +{ + "environment": "gym", + "level": "CartPole", + "max_episode_timesteps": 500, + "min_value": -3.0, + "max_value": 3.0 +} diff --git 
a/benchmarks/configs/dqn_tensorforce.json b/benchmarks/configs/dqn_tensorforce.json new file mode 100644 index 000000000..3f0c9a21e --- /dev/null +++ b/benchmarks/configs/dqn_tensorforce.json @@ -0,0 +1,77 @@ +{ + "agent": "tensorforce", + "policy": { + "type": "parametrized_value_policy", + "network": { + "type": "auto", + "size": 64, + "depth": 2, + "final_size": null, + "final_depth": 1, + "rnn": false + }, + "single_output": true, + "state_value_mode": "implicit" + }, + "memory": { + "type": "replay", + "capacity": 10000, + "device": "CPU" + }, + "update": { + "unit": "timesteps", + "batch_size": 32, + "frequency": 0.25, + "start": null + }, + "optimizer": { + "optimizer": "adam", + "learning_rate": 0.001, + "gradient_norm_clipping": null, + "clipping_threshold": null, + "multi_step": 1, + "subsampling_fraction": 1.0, + "linesearch_iterations": 0, + "doublecheck_update": false + }, + "objective": { + "type": "action_value", + "huber_loss": null, + "early_reduce": true + }, + "reward_estimation": { + "horizon": 1, + "discount": 0.99, + "predict_horizon_values": "late", + "estimate_advantage": false, + "predict_action_values": false, + "reward_processing": null, + "return_processing": null, + "advantage_processing": null, + "predict_terminal_values": false + }, + "baseline": { + "type": "parametrized_value_policy", + "network": { + "type": "auto", + "size": 64, + "depth": 2, + "final_size": null, + "final_depth": 1, + "rnn": false + }, + "single_output": true, + "state_value_mode": "implicit" + }, + "baseline_optimizer": { + "type": "synchronization", + "update_weight": 1.0, + "sync_frequency": 1 + }, + "baseline_objective": null, + "l2_regularization": 0.0, + "entropy_regularization": 0.0, + "state_preprocessing": "linear_normalization", + "exploration": 0.0, + "variable_noise": 0.0 +} diff --git a/benchmarks/configs/ppo.json b/benchmarks/configs/ppo.json new file mode 100644 index 000000000..a34fef6dd --- /dev/null +++ b/benchmarks/configs/ppo.json @@ -0,0 +1,24 @@ +{ + "agent": "ppo", + "network": {"type": "auto", "rnn": false}, + "use_beta_distribution": false, + "memory": "minimum", + "batch_size": 12, + "update_frequency": 1, + "learning_rate": 0.001813150053725916, + "multi_step": 5, + "subsampling_fraction": 0.9131375430837279, + "likelihood_ratio_clipping": 0.09955676846552193, + "discount": 0.9985351346308641, + "return_processing": null, + "advantage_processing": null, + "predict_terminal_values": false, + "reward_processing": null, + "baseline": {"type": "auto", "rnn": false}, + "baseline_optimizer": {"optimizer": "adam", "learning_rate": 0.003670157218888348, "multi_step": 10}, + "l2_regularization": 0.0, + "entropy_regularization": 0.0011393096635237982, + "state_preprocessing": "linear_normalization", + "exploration": 0.0, + "variable_noise": 0.0 +} diff --git a/benchmarks/configs/ppo_tensorforce.json b/benchmarks/configs/ppo_tensorforce.json new file mode 100644 index 000000000..dd728e383 --- /dev/null +++ b/benchmarks/configs/ppo_tensorforce.json @@ -0,0 +1,100 @@ +{ + "agent": "tensorforce", + "policy": { + "type": "parametrized_distributions", + "network": { + "type": "auto", + "size": 64, + "depth": 2, + "final_size": null, + "final_depth": 1, + "rnn": false + }, + "single_output": true, + "distributions": { + "bool": { + "type": "bernoulli" + }, + "int": { + "type": "categorical", + "temperature_mode": null + }, + "float": { + "type": "gaussian", + "stddev_mode": "predicted", + "bounded_transform": "tanh" + } + }, + "temperature": 1.0, + "use_beta_distribution": 
false + }, + "memory": { + "type": "recent", + "capacity": null, + "device": "CPU" + }, + "update": { + "unit": "episodes", + "batch_size": 12, + "frequency": 1, + "start": null + }, + "optimizer": { + "optimizer": "adam", + "learning_rate": 0.001813150053725916, + "gradient_norm_clipping": null, + "clipping_threshold": null, + "multi_step": 5, + "subsampling_fraction": 0.9131375430837279, + "linesearch_iterations": 0, + "doublecheck_update": false + }, + "objective": { + "type": "policy_gradient", + "importance_sampling": true, + "clipping_value": 0.09955676846552193, + "early_reduce": true + }, + "reward_estimation": { + "horizon": "episode", + "discount": 0.9985351346308641, + "predict_horizon_values": "early", + "estimate_advantage": true, + "predict_action_values": false, + "reward_processing": null, + "return_processing": null, + "advantage_processing": null, + "predict_terminal_values": false + }, + "baseline": { + "type": "parametrized_state_value", + "network": { + "type": "auto", + "size": 64, + "depth": 2, + "final_size": null, + "final_depth": 1, + "rnn": false + } + }, + "baseline_optimizer": { + "optimizer": "adam", + "learning_rate": 0.003670157218888348, + "gradient_norm_clipping": null, + "clipping_threshold": null, + "multi_step": 10, + "subsampling_fraction": 1.0, + "linesearch_iterations": 0, + "doublecheck_update": false + }, + "baseline_objective": { + "type": "state_value", + "huber_loss": null, + "early_reduce": true + }, + "l2_regularization": 0.0, + "entropy_regularization": 0.0011393096635237982, + "state_preprocessing": "linear_normalization", + "exploration": 0.0, + "variable_noise": 0.0 +} diff --git a/benchmarks/gym-cartpole/ppo.json b/benchmarks/gym-cartpole/ppo.json new file mode 100644 index 000000000..6d163f553 --- /dev/null +++ b/benchmarks/gym-cartpole/ppo.json @@ -0,0 +1 @@ +{"rewards": [[32.0, 11.0, 57.0, 15.0, 44.0, 86.0, 15.0, 16.0, 19.0, 15.0], [18.0, 56.0, 24.0, 23.0, 14.0, 9.0, 35.0, 11.0, 35.0, 39.0], [15.0, 11.0, 42.0, 15.0, 12.0, 22.0, 12.0, 13.0, 13.0, 11.0], [38.0, 10.0, 18.0, 14.0, 55.0, 24.0, 11.0, 25.0, 15.0, 10.0], [15.0, 14.0, 46.0, 17.0, 14.0, 22.0, 33.0, 19.0, 58.0, 47.0], [10.0, 103.0, 50.0, 16.0, 47.0, 113.0, 23.0, 45.0, 19.0, 19.0], [78.0, 20.0, 157.0, 15.0, 30.0, 80.0, 209.0, 16.0, 21.0, 11.0], [103.0, 29.0, 169.0, 85.0, 60.0, 138.0, 102.0, 47.0, 42.0, 95.0], [250.0, 55.0, 148.0, 67.0, 108.0, 94.0, 240.0, 69.0, 31.0, 24.0], [185.0, 210.0, 331.0, 129.0, 138.0, 147.0, 149.0, 140.0, 92.0, 16.0], [290.0, 137.0, 358.0, 103.0, 92.0, 125.0, 500.0, 264.0, 258.0, 208.0], [203.0, 216.0, 196.0, 228.0, 161.0, 142.0, 497.0, 166.0, 144.0, 202.0], [170.0, 174.0, 255.0, 386.0, 141.0, 165.0, 500.0, 296.0, 221.0, 207.0], [214.0, 500.0, 369.0, 500.0, 154.0, 149.0, 474.0, 216.0, 331.0, 271.0], [440.0, 324.0, 231.0, 238.0, 157.0, 131.0, 33.0, 279.0, 263.0, 303.0], [256.0, 500.0, 296.0, 500.0, 165.0, 160.0, 500.0, 500.0, 282.0, 196.0], [210.0, 500.0, 282.0, 500.0, 162.0, 159.0, 371.0, 355.0, 273.0, 248.0], [310.0, 500.0, 385.0, 500.0, 149.0, 145.0, 500.0, 368.0, 251.0, 255.0], [500.0, 500.0, 272.0, 500.0, 158.0, 143.0, 500.0, 500.0, 243.0, 235.0], [500.0, 500.0, 315.0, 500.0, 156.0, 148.0, 268.0, 500.0, 241.0, 287.0], [500.0, 500.0, 282.0, 500.0, 165.0, 151.0, 500.0, 500.0, 256.0, 302.0], [500.0, 500.0, 379.0, 500.0, 160.0, 155.0, 500.0, 500.0, 327.0, 385.0], [500.0, 500.0, 500.0, 500.0, 158.0, 163.0, 358.0, 500.0, 500.0, 371.0], [500.0, 500.0, 500.0, 500.0, 186.0, 147.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 471.0, 500.0, 160.0, 138.0, 500.0, 
500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 176.0, 160.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 191.0, 176.0, 393.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 185.0, 160.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 197.0, 150.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 191.0, 175.0, 453.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 231.0, 186.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 225.0, 173.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 230.0, 173.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 190.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 256.0, 193.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 210.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 209.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 217.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 202.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 237.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 250.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 270.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 273.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 299.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 171.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], 
[500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 434.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 385.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0], [500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0]], "timesteps": [[32, 11, 57, 15, 44, 86, 15, 16, 19, 15], [18, 56, 24, 23, 14, 9, 35, 11, 35, 39], [15, 11, 42, 15, 12, 22, 12, 13, 13, 11], [38, 10, 18, 14, 55, 24, 11, 25, 15, 10], [15, 14, 46, 17, 14, 22, 33, 19, 58, 47], [10, 103, 50, 16, 47, 113, 23, 45, 19, 19], [78, 20, 157, 15, 30, 80, 209, 16, 21, 11], [103, 29, 169, 85, 60, 138, 102, 47, 42, 95], [250, 55, 148, 67, 108, 94, 240, 69, 31, 24], [185, 210, 331, 129, 138, 147, 149, 140, 92, 16], [290, 137, 358, 103, 92, 125, 500, 264, 258, 208], [203, 216, 196, 228, 161, 142, 497, 166, 144, 202], [170, 174, 255, 386, 141, 165, 500, 296, 221, 207], [214, 500, 369, 500, 154, 149, 474, 216, 331, 271], [440, 324, 231, 238, 157, 131, 33, 279, 263, 303], [256, 500, 296, 500, 165, 160, 500, 500, 282, 196], [210, 500, 282, 500, 162, 159, 371, 355, 273, 248], [310, 500, 385, 500, 149, 145, 500, 368, 251, 255], [500, 500, 272, 500, 158, 143, 500, 500, 243, 235], [500, 500, 315, 500, 156, 148, 268, 500, 241, 287], [500, 500, 282, 500, 165, 151, 500, 500, 256, 302], [500, 500, 379, 500, 160, 155, 500, 500, 327, 385], [500, 500, 500, 500, 158, 163, 358, 500, 500, 371], [500, 500, 500, 500, 186, 147, 500, 500, 500, 500], [500, 500, 471, 500, 160, 138, 500, 500, 500, 500], [500, 500, 500, 500, 176, 160, 500, 500, 500, 500], [500, 500, 500, 500, 191, 176, 393, 500, 500, 500], [500, 500, 500, 500, 185, 160, 500, 500, 500, 500], [500, 500, 500, 500, 197, 150, 500, 500, 500, 500], [500, 500, 500, 500, 191, 175, 453, 500, 500, 500], [500, 500, 500, 500, 231, 186, 500, 500, 500, 500], [500, 500, 500, 500, 225, 173, 500, 500, 500, 500], [500, 500, 500, 500, 230, 173, 500, 500, 500, 500], [500, 500, 500, 500, 500, 190, 
500, 500, 500, 500], [500, 500, 500, 500, 256, 193, 500, 500, 500, 500], [500, 500, 500, 500, 500, 210, 500, 500, 500, 500], [500, 500, 500, 500, 500, 209, 500, 500, 500, 500], [500, 500, 500, 500, 500, 217, 500, 500, 500, 500], [500, 500, 500, 500, 500, 202, 500, 500, 500, 500], [500, 500, 500, 500, 500, 237, 500, 500, 500, 500], [500, 500, 500, 500, 500, 250, 500, 500, 500, 500], [500, 500, 500, 500, 500, 270, 500, 500, 500, 500], [500, 500, 500, 500, 500, 273, 500, 500, 500, 500], [500, 500, 500, 500, 500, 299, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 171, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 434, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 385, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500], [500, 500, 500, 500, 500, 500, 500, 500, 500, 500]], "seconds": [[0.17463970184326172, 0.04439902305603027, 0.15268349647521973, 0.052193403244018555, 
0.12912607192993164, 0.26495361328125, 0.06385040283203125, 0.0655982494354248, 0.08155059814453125, 0.058976173400878906], [0.07072663307189941, 0.14246273040771484, 0.07554435729980469, 0.06851649284362793, 0.05865359306335449, 0.05245232582092285, 0.12143278121948242, 0.054824113845825195, 0.1268312931060791, 0.1322648525238037], [0.05544924736022949, 0.04325675964355469, 0.11979246139526367, 0.05157136917114258, 0.05224132537841797, 0.09038805961608887, 0.057669878005981445, 0.08629202842712402, 0.06148171424865723, 0.05678510665893555], [0.11873674392700195, 0.04248857498168945, 0.0593419075012207, 0.05370306968688965, 0.18289828300476074, 0.09828042984008789, 0.05628395080566406, 0.13979005813598633, 0.06595897674560547, 0.05010700225830078], [0.06361532211303711, 0.05034470558166504, 0.1282813549041748, 0.05811262130737305, 0.06117129325866699, 0.08964109420776367, 0.12223649024963379, 0.1185920238494873, 0.1871497631072998, 0.16295361518859863], [0.04349207878112793, 0.2525665760040283, 0.13256502151489258, 0.05278897285461426, 0.14914846420288086, 0.36553072929382324, 0.09462428092956543, 0.24608707427978516, 0.07935476303100586, 0.07907962799072266], [0.24827194213867188, 0.06612038612365723, 0.37429070472717285, 0.052565574645996094, 0.10418081283569336, 0.2689402103424072, 0.6395711898803711, 0.110382080078125, 0.0843806266784668, 0.055245399475097656], [0.29978156089782715, 0.09000015258789062, 0.40712928771972656, 0.20465493202209473, 0.18639564514160156, 0.4318420886993408, 0.3316497802734375, 0.22648954391479492, 0.15036940574645996, 0.3505828380584717], [0.8176186084747314, 0.1440272331237793, 0.3627157211303711, 0.17064356803894043, 0.31166863441467285, 0.3628242015838623, 0.7470476627349854, 0.24938607215881348, 0.11642050743103027, 0.1479346752166748], [0.5921664237976074, 0.49475765228271484, 0.7760989665985107, 0.4046344757080078, 0.3940725326538086, 0.5857071876525879, 0.48522305488586426, 0.42809247970581055, 0.2837982177734375, 0.12626338005065918], [0.9032919406890869, 0.33767271041870117, 0.8371288776397705, 0.260753870010376, 0.27144622802734375, 0.42220616340637207, 1.671825647354126, 0.7923629283905029, 0.7444021701812744, 0.7768411636352539], [0.6218767166137695, 0.508418083190918, 0.5706582069396973, 0.5423567295074463, 0.6096222400665283, 0.45639634132385254, 1.5412206649780273, 0.5161628723144531, 0.4332706928253174, 0.5889840126037598], [0.5441784858703613, 0.4223480224609375, 0.6230626106262207, 0.9059205055236816, 0.4185919761657715, 0.530447244644165, 1.5426902770996094, 0.8935301303863525, 0.6638424396514893, 0.6026480197906494], [0.6897387504577637, 1.1579651832580566, 0.8701198101043701, 1.1492912769317627, 0.44709277153015137, 0.4841184616088867, 1.5781629085540771, 0.6798050403594971, 0.9639689922332764, 0.7700605392456055], [1.519559383392334, 0.795208215713501, 0.5585196018218994, 0.5755276679992676, 0.45711207389831543, 0.43386006355285645, 0.30799436569213867, 1.0211968421936035, 0.9055263996124268, 0.8677325248718262], [0.8506667613983154, 1.220198392868042, 0.7163333892822266, 1.1614179611206055, 0.47583723068237305, 0.5189855098724365, 1.6612608432769775, 1.492305040359497, 1.0367918014526367, 0.6018962860107422], [0.6935958862304688, 1.16495680809021, 0.6889705657958984, 1.2599575519561768, 0.478985071182251, 0.5227930545806885, 1.189150094985962, 1.106351613998413, 0.7921931743621826, 0.9835479259490967], [0.9723513126373291, 1.1755943298339844, 0.9259264469146729, 1.1715397834777832, 0.4572741985321045, 0.47899460792541504, 
1.5905356407165527, 1.148782730102539, 0.7583062648773193, 0.8321597576141357], [1.5246856212615967, 1.2033851146697998, 0.7643074989318848, 1.1912274360656738, 0.4766261577606201, 0.5144851207733154, 1.8858368396759033, 1.7081472873687744, 0.7475206851959229, 0.7052145004272461], [1.639861822128296, 1.3283312320709229, 0.7741754055023193, 1.2014439105987549, 0.4696979522705078, 0.630216121673584, 0.9306139945983887, 1.5477876663208008, 0.8049414157867432, 0.8613035678863525], [1.6054580211639404, 1.2135035991668701, 0.6996381282806396, 1.2982254028320312, 0.5011873245239258, 0.5154337882995605, 1.5919177532196045, 1.5544240474700928, 1.00654935836792, 0.9004759788513184], [1.5117909908294678, 1.211768388748169, 0.9107065200805664, 1.2135779857635498, 0.5563464164733887, 0.5055887699127197, 1.764960527420044, 1.7265360355377197, 1.118666648864746, 1.1727921962738037], [1.3013949394226074, 1.237795114517212, 1.1814606189727783, 1.2456886768341064, 0.5708761215209961, 0.5511958599090576, 1.1880578994750977, 1.6110923290252686, 1.4407384395599365, 1.3857264518737793], [1.2121386528015137, 1.3246371746063232, 1.2518203258514404, 1.2158870697021484, 0.564417839050293, 0.48673152923583984, 1.6210200786590576, 1.5488998889923096, 1.4860997200012207, 1.4991226196289062], [1.2088992595672607, 1.2247719764709473, 1.1373655796051025, 1.318068504333496, 0.49095606803894043, 0.4694066047668457, 1.7596821784973145, 1.7319974899291992, 1.796600580215454, 1.483921766281128], [1.2204093933105469, 1.2212646007537842, 1.1934010982513428, 1.2362334728240967, 0.5467305183410645, 0.526832103729248, 1.585075855255127, 1.5724422931671143, 1.4597878456115723, 1.839839220046997], [1.308258056640625, 1.2398431301116943, 1.2089817523956299, 1.2301454544067383, 0.57187819480896, 0.574150562286377, 1.313302993774414, 1.5704333782196045, 1.4844586849212646, 1.5082905292510986], [1.210585355758667, 1.3325486183166504, 1.2269067764282227, 1.2300820350646973, 0.5653653144836426, 0.5285632610321045, 1.7830989360809326, 1.7049272060394287, 1.6682102680206299, 1.4867982864379883], [1.2187142372131348, 1.2405297756195068, 1.3054945468902588, 1.3146822452545166, 0.5946910381317139, 0.6665694713592529, 1.7897443771362305, 1.5404913425445557, 1.5114212036132812, 1.8771162033081055], [1.223639965057373, 1.2303977012634277, 1.218837022781372, 1.2254838943481445, 0.5830938816070557, 0.5876333713531494, 1.4915366172790527, 1.5487563610076904, 1.4936952590942383, 1.545154333114624], [1.308436632156372, 1.2399165630340576, 1.2161054611206055, 1.2459402084350586, 0.8368299007415771, 0.6025996208190918, 1.7667787075042725, 1.6507866382598877, 1.6747586727142334, 1.5222535133361816], [1.2418570518493652, 1.3138177394866943, 1.231269121170044, 1.2406129837036133, 0.6757161617279053, 0.5789778232574463, 1.626528024673462, 1.789754867553711, 1.5193440914154053, 1.8458960056304932], [1.2392280101776123, 1.2228009700775146, 1.3110432624816895, 1.3317677974700928, 0.6940932273864746, 0.5717735290527344, 1.6182410717010498, 1.5602333545684814, 1.5505197048187256, 1.5304198265075684], [1.2477126121520996, 1.2368571758270264, 1.236649751663208, 1.2288932800292969, 1.3952069282531738, 0.6118524074554443, 1.7848718166351318, 1.6687703132629395, 1.639343500137329, 3.3620686531066895], [1.3171565532684326, 1.222970724105835, 1.2438700199127197, 1.2368097305297852, 0.7733826637268066, 0.6248531341552734, 1.626847743988037, 1.8046879768371582, 1.778320074081421, 3.376312732696533], [1.227942943572998, 1.3362255096435547, 1.2363719940185547, 
1.2369482517242432, 1.562479019165039, 0.6736929416656494, 1.6240878105163574, 1.5686893463134766, 1.5488026142120361, 1.8955330848693848], [1.2166650295257568, 1.2304656505584717, 1.3261253833770752, 1.3298399448394775, 1.4264068603515625, 0.8462769985198975, 1.809342861175537, 1.7356529235839844, 1.6377573013305664, 1.5544986724853516], [1.2392160892486572, 1.233821153640747, 1.2255887985229492, 1.258720874786377, 1.4579803943634033, 0.7107527256011963, 1.6386401653289795, 1.7298479080200195, 1.766554832458496, 1.5499987602233887], [1.3190176486968994, 1.238365888595581, 1.265310287475586, 1.3088428974151611, 1.4694678783416748, 0.6570024490356445, 1.6423892974853516, 1.5717921257019043, 1.5367095470428467, 1.8759350776672363], [1.2223529815673828, 1.3202555179595947, 1.2725105285644531, 1.262108564376831, 1.645561933517456, 0.8297841548919678, 2.081364154815674, 1.6898796558380127, 1.5778553485870361, 1.5706658363342285], [1.2401022911071777, 1.229118824005127, 1.301494836807251, 1.4214940071105957, 1.5190529823303223, 0.8299169540405273, 1.7147581577301025, 1.7601902484893799, 1.8208458423614502, 1.5591039657592773], [1.2222650051116943, 1.2644059658050537, 1.2269275188446045, 1.2885007858276367, 1.5077581405639648, 0.8628482818603516, 1.6546483039855957, 1.5554816722869873, 1.5362420082092285, 1.8884456157684326], [1.3205645084381104, 1.2252802848815918, 1.2369353771209717, 1.3069403171539307, 1.6589369773864746, 1.0359160900115967, 1.7996068000793457, 1.679236888885498, 1.602426290512085, 1.5741400718688965], [1.1994614601135254, 1.3303086757659912, 1.2731742858886719, 1.3990545272827148, 1.531872034072876, 0.9552991390228271, 1.6256258487701416, 1.716813564300537, 1.8401734828948975, 1.5526094436645508], [1.3281822204589844, 1.25514817237854, 1.2917895317077637, 1.3031587600708008, 1.5681705474853516, 1.5188572406768799, 1.6196696758270264, 1.5620970726013184, 1.5478715896606445, 1.9059560298919678], [1.2215290069580078, 1.2244372367858887, 1.2671582698822021, 1.2882394790649414, 1.7288589477539062, 1.5430090427398682, 1.800513744354248, 1.6500494480133057, 1.568610429763794, 1.5648064613342285], [1.3358755111694336, 1.2622041702270508, 1.246058464050293, 1.2967653274536133, 1.5912744998931885, 1.726078987121582, 1.6367764472961426, 1.746659278869629, 1.8272662162780762, 1.549253225326538], [1.2195401191711426, 1.3167524337768555, 1.3016550540924072, 1.4215548038482666, 1.5894007682800293, 1.583570957183838, 1.6320970058441162, 1.5496602058410645, 1.5331547260284424, 1.8906896114349365], [1.2449288368225098, 1.2389953136444092, 1.2564880847930908, 1.29591703414917, 1.7683098316192627, 1.6035678386688232, 1.78947114944458, 1.6273765563964844, 1.544759750366211, 1.5550312995910645], [1.2358336448669434, 1.2427177429199219, 1.2252366542816162, 1.296644687652588, 1.590799331665039, 1.7598061561584473, 1.622875452041626, 1.8089580535888672, 1.8690569400787354, 1.533860445022583], [1.3180053234100342, 1.269782543182373, 1.2247114181518555, 1.287109613418579, 1.6120448112487793, 1.6210641860961914, 1.643498182296753, 1.5804414749145508, 1.5408785343170166, 1.922868251800537], [1.2214205265045166, 1.3116092681884766, 1.3326201438903809, 1.4460234642028809, 1.8446130752563477, 1.61722731590271, 1.8199784755706787, 1.6425185203552246, 1.5804319381713867, 1.5402987003326416], [1.2336554527282715, 1.2431960105895996, 1.2394096851348877, 1.3321456909179688, 1.6591265201568604, 1.785435438156128, 1.6428422927856445, 1.8072431087493896, 1.8508131504058838, 1.5522356033325195], [1.2441794872283936, 
1.2455458641052246, 1.232602596282959, 1.301365613937378, 1.6170661449432373, 1.632427453994751, 1.6338603496551514, 1.5599610805511475, 1.537806749343872, 1.900785207748413], [1.3359441757202148, 1.4089109897613525, 1.2309677600860596, 1.3211662769317627, 1.7990188598632812, 1.6500811576843262, 1.79608154296875, 1.6409742832183838, 1.5598468780517578, 1.5507655143737793], [1.225773811340332, 1.2450973987579346, 1.3600749969482422, 1.4233553409576416, 1.6725738048553467, 1.8195223808288574, 1.6615419387817383, 1.7858755588531494, 1.8659777641296387, 1.5234153270721436], [1.215599775314331, 1.23952054977417, 1.2390992641448975, 1.2960443496704102, 1.6930241584777832, 1.650467872619629, 1.646226167678833, 1.5473289489746094, 1.5274834632873535, 1.9205682277679443], [1.2409749031066895, 1.259613275527954, 1.241806983947754, 1.3369512557983398, 2.1880953311920166, 1.6475756168365479, 2.8756983280181885, 1.6400763988494873, 1.5581469535827637, 1.522672414779663], [1.3346292972564697, 1.3691837787628174, 1.2678313255310059, 1.4323086738586426, 1.6186552047729492, 1.8184313774108887, 1.7469921112060547, 1.8190953731536865, 1.8880279064178467, 1.539003849029541], [1.22434663772583, 1.245241403579712, 1.3225202560424805, 1.309234619140625, 1.651801347732544, 1.6442134380340576, 1.8217382431030273, 1.5362391471862793, 1.547257423400879, 1.9000952243804932], [1.2159364223480225, 1.2531449794769287, 1.2430689334869385, 1.3197979927062988, 1.823113203048706, 1.6171367168426514, 1.6517679691314697, 1.6307153701782227, 1.553359031677246, 1.5896830558776855], [1.2152645587921143, 1.2308001518249512, 1.2146937847137451, 1.317331075668335, 1.63690185546875, 1.820033311843872, 1.6653847694396973, 1.8254411220550537, 1.8448457717895508, 1.5821356773376465], [1.3050110340118408, 1.3138387203216553, 1.2224843502044678, 1.4612681865692139, 1.6966056823730469, 1.6729185581207275, 1.8047668933868408, 1.5417656898498535, 1.551621437072754, 1.9022471904754639], [1.2208423614501953, 1.2377173900604248, 1.3259782791137695, 1.3183932304382324, 1.9621903896331787, 1.6337933540344238, 1.6176910400390625, 1.6480891704559326, 1.5303776264190674, 1.5476889610290527], [1.2289226055145264, 1.2236557006835938, 1.237837314605713, 1.33687424659729, 1.6493136882781982, 1.822122573852539, 1.654390811920166, 1.817326545715332, 1.860685110092163, 1.5273480415344238], [1.2529206275939941, 1.2395200729370117, 1.2414679527282715, 1.3195738792419434, 1.7783458232879639, 1.6114706993103027, 1.8165373802185059, 1.5388658046722412, 1.5521063804626465, 1.8899328708648682], [1.3616547584533691, 1.3386034965515137, 1.227203607559204, 1.4402720928192139, 1.6348586082458496, 1.6043274402618408, 1.6363513469696045, 1.6315374374389648, 1.54010009765625, 1.5362756252288818], [1.230715036392212, 1.2554280757904053, 1.3175687789916992, 1.345771074295044, 1.6391785144805908, 1.8114233016967773, 0.7237024307250977, 1.8165075778961182, 1.8448760509490967, 1.53389573097229], [1.1989028453826904, 1.2445533275604248, 1.2509064674377441, 1.3404335975646973, 1.8000884056091309, 1.6345314979553223, 1.7625198364257812, 1.524791955947876, 1.532926082611084, 1.8802237510681152], [1.2062430381774902, 1.2351529598236084, 1.2292733192443848, 1.4948773384094238, 1.633047342300415, 1.6633296012878418, 1.6312963962554932, 1.6284797191619873, 1.5442843437194824, 1.5526282787322998], [1.3659369945526123, 1.3345987796783447, 1.2270233631134033, 1.3355059623718262, 1.6830363273620605, 1.9783344268798828, 1.6167340278625488, 1.8234021663665771, 1.8604786396026611, 
1.5651969909667969], [1.2172493934631348, 1.2313547134399414, 1.312706470489502, 1.3424293994903564, 1.8081986904144287, 1.619136095046997, 1.7680690288543701, 1.5639557838439941, 1.549670934677124, 1.8975634574890137], [1.2233202457427979, 1.2154035568237305, 1.2443315982818604, 1.3464374542236328, 1.63230299949646, 1.7795696258544922, 1.6223118305206299, 1.6713812351226807, 1.55574369430542, 1.5297064781188965], [1.2437846660614014, 1.2391600608825684, 1.2214815616607666, 1.498891830444336, 1.628321886062622, 1.8307430744171143, 1.6294572353363037, 1.7835543155670166, 1.8772804737091064, 1.5280392169952393], [1.3177106380462646, 1.336141586303711, 1.236485481262207, 1.3528196811676025, 1.79091477394104, 1.6174817085266113, 1.782729148864746, 1.5345566272735596, 1.3641486167907715, 1.9014697074890137], [1.2250609397888184, 1.2348318099975586, 1.306976318359375, 1.347343921661377, 1.643563985824585, 1.7942259311676025, 1.7113187313079834, 1.6547126770019531, 1.5420572757720947, 1.5288403034210205], [1.225719690322876, 1.2346749305725098, 1.2241370677947998, 1.3933100700378418, 1.6193220615386963, 1.5963072776794434, 1.6221961975097656, 1.7908403873443604, 1.87638258934021, 1.5598602294921875], [1.2266457080841064, 1.2343173027038574, 1.2378857135772705, 1.4803121089935303, 1.831540584564209, 1.633272647857666, 1.76230788230896, 1.5326473712921143, 1.5316290855407715, 1.864076852798462], [1.3081042766571045, 1.3180203437805176, 1.2333390712738037, 1.3669054508209229, 1.6375267505645752, 1.8089017868041992, 1.6235878467559814, 1.6288058757781982, 1.53999662399292, 1.5286283493041992], [1.2471728324890137, 1.2388195991516113, 1.3205628395080566, 1.364194631576538, 1.6561975479125977, 1.6211609840393066, 2.048556089401245, 1.8038437366485596, 1.8714673519134521, 1.5486125946044922], [1.2227990627288818, 1.2651283740997314, 1.237990140914917, 1.5052340030670166, 1.836561679840088, 1.635915756225586, 2.3433315753936768, 1.5253596305847168, 1.55670166015625, 1.8930749893188477], [1.2211089134216309, 1.2485506534576416, 1.2387537956237793, 1.3674037456512451, 1.6315405368804932, 1.7859861850738525, 1.941807746887207, 1.6086828708648682, 1.5394082069396973, 1.5439326763153076], [1.3454713821411133, 1.3218472003936768, 1.230278730392456, 1.3584692478179932, 1.6295619010925293, 1.62211275100708, 1.8131904602050781, 1.8227524757385254, 1.8618037700653076, 1.5227999687194824], [1.2344729900360107, 1.228734016418457, 1.3113822937011719, 1.371335506439209, 1.8332240581512451, 1.6511247158050537, 1.818634033203125, 1.5459694862365723, 1.528123140335083, 1.8671393394470215], [1.2259118556976318, 1.226994276046753, 1.237070083618164, 1.5373649597167969, 1.6308038234710693, 1.8106904029846191, 1.6437103748321533, 1.5970535278320312, 1.545576810836792, 1.5185725688934326], [1.2370707988739014, 1.238590955734253, 1.2350373268127441, 1.4054038524627686, 1.669980764389038, 1.6640455722808838, 1.7903296947479248, 1.8490557670593262, 1.8711307048797607, 1.5206162929534912], [1.3312358856201172, 1.3329992294311523, 1.2246735095977783, 1.4114711284637451, 1.79213285446167, 1.6194860935211182, 1.6456067562103271, 1.5315752029418945, 1.5235376358032227, 1.8770644664764404], [1.2128772735595703, 1.2324645519256592, 1.3077731132507324, 1.550919771194458, 1.6204869747161865, 1.7915618419647217, 1.6513233184814453, 1.5827410221099854, 1.5542807579040527, 1.532090663909912], [1.2346546649932861, 1.244570255279541, 1.2469244003295898, 1.4039928913116455, 5.0784759521484375, 1.6191041469573975, 1.7922148704528809, 
1.8468530178070068, 1.8889687061309814, 1.5293424129486084], [1.218634843826294, 1.2289364337921143, 1.2157282829284668, 1.3746767044067383, 2.1434457302093506, 1.3063876628875732, 1.6221492290496826, 1.537971019744873, 1.552734613418579, 1.874171495437622], [1.3211638927459717, 1.332700490951538, 1.2168614864349365, 1.4024531841278076, 1.9848113059997559, 1.777515172958374, 1.763115406036377, 1.573150634765625, 1.5725862979888916, 1.5361013412475586], [1.2141468524932861, 1.22324800491333, 1.3265178203582764, 1.5715975761413574, 1.6209511756896973, 1.593766212463379, 1.82718825340271, 1.8205771446228027, 1.8868257999420166, 1.5524091720581055], [1.2419605255126953, 1.2298979759216309, 1.2306697368621826, 1.4256727695465088, 1.7968800067901611, 1.6164329051971436, 1.640782356262207, 1.539551019668579, 1.538414716720581, 1.861530065536499], [1.2406933307647705, 1.2272605895996094, 1.2309293746948242, 1.4209222793579102, 1.8467657566070557, 1.7796437740325928, 1.6585311889648438, 1.5471758842468262, 1.5643892288208008, 1.541152000427246], [1.328275203704834, 1.3191428184509277, 1.2286021709442139, 1.5859179496765137, 1.618990421295166, 1.6163110733032227, 1.814232349395752, 1.8690125942230225, 1.8790299892425537, 1.564824104309082], [1.213759183883667, 1.2347650527954102, 1.3055758476257324, 1.4349615573883057, 1.8387730121612549, 1.6122913360595703, 1.7050433158874512, 1.5477924346923828, 1.54878830909729, 1.8726692199707031], [1.2505700588226318, 1.2201554775238037, 1.2273430824279785, 1.4211032390594482, 1.641408920288086, 1.7879035472869873, 1.860262155532837, 1.5312905311584473, 1.5438330173492432, 1.5541818141937256], [1.2374887466430664, 1.2508373260498047, 1.2238380908966064, 1.5550258159637451, 1.638118028640747, 1.608349323272705, 1.7398478984832764, 1.8706612586975098, 1.8654706478118896, 1.5263152122497559], [1.3068757057189941, 1.3205785751342773, 1.2227208614349365, 1.4819703102111816, 1.7843503952026367, 1.6288440227508545, 1.6008954048156738, 1.5738894939422607, 1.5324695110321045, 1.8423852920532227], [1.2092816829681396, 1.2219853401184082, 1.3290164470672607, 1.4430384635925293, 1.6241998672485352, 1.7683186531066895, 1.5873332023620605, 1.5467917919158936, 1.5337672233581543, 1.5770843029022217]], "agent_seconds": [[0.16482234001159668, 0.041387319564819336, 0.14253473281860352, 0.04873466491699219, 0.12074708938598633, 0.24800562858581543, 0.05980229377746582, 0.06144857406616211, 0.07614874839782715, 0.05512189865112305], [0.0668489933013916, 0.134199857711792, 0.07145547866821289, 0.0649712085723877, 0.05599665641784668, 0.050477027893066406, 0.11503171920776367, 0.052533626556396484, 0.12013411521911621, 0.12478494644165039], [0.05263805389404297, 0.0413970947265625, 0.11304688453674316, 0.04912924766540527, 0.05003523826599121, 0.08594489097595215, 0.05517220497131348, 0.0821220874786377, 0.05859947204589844, 0.05423998832702637], [0.11166763305664062, 0.040651559829711914, 0.056371450424194336, 0.05112028121948242, 0.1722853183746338, 0.09331536293029785, 0.05398154258728027, 0.13172411918640137, 0.06290864944458008, 0.048043012619018555], [0.06037259101867676, 0.0480341911315918, 0.1207430362701416, 0.05523276329040527, 0.05838370323181152, 0.08524703979492188, 0.11582446098327637, 0.11193108558654785, 0.17630863189697266, 0.15389704704284668], [0.041565656661987305, 0.23712849617004395, 0.12484598159790039, 0.050295352935791016, 0.140916109085083, 0.34344053268432617, 0.08997344970703125, 0.23113131523132324, 0.07551383972167969, 0.07532763481140137], 
[0.2322859764099121, 0.06289172172546387, 0.3513610363006592, 0.05004310607910156, 0.09869718551635742, 0.25333523750305176, 0.6004691123962402, 0.1049957275390625, 0.08015918731689453, 0.05298900604248047], [0.27959728240966797, 0.08513569831848145, 0.3823873996734619, 0.19214820861816406, 0.17592287063598633, 0.40602803230285645, 0.3124701976776123, 0.21295571327209473, 0.14203763008117676, 0.329056978225708], [0.7632005214691162, 0.13564419746398926, 0.3407471179962158, 0.16055893898010254, 0.29320430755615234, 0.3417503833770752, 0.7017745971679688, 0.2347888946533203, 0.1102743148803711, 0.14008259773254395], [0.5521841049194336, 0.46337056159973145, 0.7267887592315674, 0.37824511528015137, 0.3699023723602295, 0.5494205951690674, 0.4566645622253418, 0.4026041030883789, 0.2672107219696045, 0.12020087242126465], [0.8414225578308105, 0.31694889068603516, 0.7841036319732666, 0.24434113502502441, 0.2555696964263916, 0.39739346504211426, 1.5680251121520996, 0.7441368103027344, 0.6984562873840332, 0.7286198139190674], [0.5809416770935059, 0.47678637504577637, 0.5346336364746094, 0.5080966949462891, 0.5715255737304688, 0.4297761917114258, 1.446850299835205, 0.4856903553009033, 0.4076564311981201, 0.5523316860198975], [0.5084178447723389, 0.3965775966644287, 0.5843150615692139, 0.8479077816009521, 0.39361572265625, 0.49884557723999023, 1.4492807388305664, 0.8392894268035889, 0.6236774921417236, 0.5654404163360596], [0.6444811820983887, 1.0834145545959473, 0.8158199787139893, 1.0756661891937256, 0.42035698890686035, 0.45527100563049316, 1.483630895614624, 0.6393866539001465, 0.9040906429290771, 0.722369909286499], [1.4209699630737305, 0.7459752559661865, 0.5244767665863037, 0.5401837825775146, 0.42992544174194336, 0.40848517417907715, 0.2964894771575928, 0.9596214294433594, 0.8533542156219482, 0.8139832019805908], [0.7953639030456543, 1.1409921646118164, 0.6725673675537109, 1.0878312587738037, 0.4473140239715576, 0.4884676933288574, 1.5581622123718262, 1.4014148712158203, 0.9713876247406006, 0.5663647651672363], [0.6492428779602051, 1.0911424160003662, 0.6471757888793945, 1.1790833473205566, 0.4506566524505615, 0.4921529293060303, 1.1187140941619873, 1.0409464836120605, 0.7441620826721191, 0.9244487285614014], [0.9080507755279541, 1.1013870239257812, 0.8688452243804932, 1.097719430923462, 0.4301893711090088, 0.45129942893981934, 1.4964983463287354, 1.0808742046356201, 0.7121896743774414, 0.7811298370361328], [1.4227254390716553, 1.127988338470459, 0.7183084487915039, 1.1163921356201172, 0.44892430305480957, 0.485823392868042, 1.768517255783081, 1.6046841144561768, 0.7030799388885498, 0.6634476184844971], [1.5312426090240479, 1.2477705478668213, 0.7274785041809082, 1.1272633075714111, 0.4423556327819824, 0.592806339263916, 0.879786491394043, 1.455887794494629, 0.7575786113739014, 0.8092734813690186], [1.4978969097137451, 1.1395070552825928, 0.6580033302307129, 1.2186126708984375, 0.47190284729003906, 0.4854147434234619, 1.4981460571289062, 1.46339750289917, 0.9476211071014404, 0.8460931777954102], [1.4126720428466797, 1.1378929615020752, 0.8553659915924072, 1.1391549110412598, 0.5250060558319092, 0.47640275955200195, 1.659452199935913, 1.6216845512390137, 1.0502781867980957, 1.1034188270568848], [1.218494176864624, 1.1636309623718262, 1.1078684329986572, 1.172013759613037, 0.5367648601531982, 0.51910400390625, 1.12044358253479, 1.5143678188323975, 1.3516428470611572, 1.300816535949707], [1.139383316040039, 1.2446768283843994, 1.1750662326812744, 1.142359972000122, 0.5313839912414551, 
0.45885229110717773, 1.5255398750305176, 1.4564628601074219, 1.3953289985656738, 1.4060509204864502], [1.1354353427886963, 1.150294542312622, 1.0675127506256104, 1.2378642559051514, 0.4625968933105469, 0.4424870014190674, 1.6530210971832275, 1.6281895637512207, 1.6849830150604248, 1.3943541049957275], [1.1469104290008545, 1.1478750705718994, 1.1197278499603271, 1.1617610454559326, 0.514862060546875, 0.4964103698730469, 1.4910743236541748, 1.4809761047363281, 1.3703641891479492, 1.7257533073425293], [1.2295887470245361, 1.1659998893737793, 1.1349947452545166, 1.156449317932129, 0.5383858680725098, 0.5403361320495605, 1.2382051944732666, 1.479156494140625, 1.3947103023529053, 1.4170818328857422], [1.1371376514434814, 1.2509441375732422, 1.1529195308685303, 1.1564676761627197, 0.5323944091796875, 0.49818944931030273, 1.6814987659454346, 1.6035284996032715, 1.5673625469207764, 1.3976376056671143], [1.1458954811096191, 1.166475534439087, 1.2256274223327637, 1.2343313694000244, 0.5596377849578857, 0.6253166198730469, 1.6836810111999512, 1.4506101608276367, 1.422081708908081, 1.762976884841919], [1.15098237991333, 1.1565654277801514, 1.1451044082641602, 1.1514670848846436, 0.548762321472168, 0.5526943206787109, 1.4046688079833984, 1.4579946994781494, 1.4039459228515625, 1.454038143157959], [1.2326931953430176, 1.1661295890808105, 1.1425304412841797, 1.1723690032958984, 0.7858767509460449, 0.5669677257537842, 1.6630988121032715, 1.5596215724945068, 1.57356858253479, 1.4308476448059082], [1.1666288375854492, 1.23301362991333, 1.1571481227874756, 1.166269063949585, 0.6355211734771729, 0.5451698303222656, 1.531994342803955, 1.6816565990447998, 1.429952621459961, 1.7340998649597168], [1.1658051013946533, 1.1485848426818848, 1.2312915325164795, 1.251208782196045, 0.6530172824859619, 0.5384736061096191, 1.523547887802124, 1.4690673351287842, 1.4598734378814697, 1.4409422874450684], [1.1740686893463135, 1.1628379821777344, 1.1633343696594238, 1.1542088985443115, 1.3090262413024902, 0.5757765769958496, 1.678875207901001, 1.5763723850250244, 1.5468320846557617, 3.2100718021392822], [1.239992380142212, 1.149080514907837, 1.1700904369354248, 1.1630516052246094, 0.7277419567108154, 0.5878379344940186, 1.5322141647338867, 1.6978695392608643, 1.671921730041504, 3.201280355453491], [1.1541521549224854, 1.254607915878296, 1.1630001068115234, 1.16300368309021, 1.4650824069976807, 0.6333985328674316, 1.5297777652740479, 1.477102518081665, 1.4595434665679932, 1.780893325805664], [1.1439287662506104, 1.1553850173950195, 1.2462968826293945, 1.2498676776885986, 1.3389716148376465, 0.7943992614746094, 1.70377516746521, 1.6402654647827148, 1.5460779666900635, 1.4630331993103027], [1.1663424968719482, 1.1587581634521484, 1.1521258354187012, 1.1843440532684326, 1.3699910640716553, 0.6685168743133545, 1.544522762298584, 1.6270360946655273, 1.660099983215332, 1.4592061042785645], [1.2414236068725586, 1.1644847393035889, 1.1900122165679932, 1.2327353954315186, 1.380340814590454, 0.6184597015380859, 1.5476112365722656, 1.4806194305419922, 1.446645975112915, 1.7618107795715332], [1.1489546298980713, 1.239664077758789, 1.1985621452331543, 1.186661720275879, 1.5460419654846191, 0.7802639007568359, 1.9627132415771484, 1.5965311527252197, 1.4873683452606201, 1.4787969589233398], [1.1671969890594482, 1.1550722122192383, 1.223297357559204, 1.3371226787567139, 1.4299182891845703, 0.7802712917327881, 1.614140510559082, 1.6581459045410156, 1.708564281463623, 1.4694318771362305], [1.1488819122314453, 1.1900322437286377, 
1.1538474559783936, 1.2126374244689941, 1.4176230430603027, 0.811089038848877, 1.559948205947876, 1.4646356105804443, 1.447317361831665, 1.7726655006408691], [1.2443947792053223, 1.1515882015228271, 1.1634602546691895, 1.231494426727295, 1.5586471557617188, 0.9726006984710693, 1.6928634643554688, 1.5858960151672363, 1.5109410285949707, 1.4840903282165527], [1.126746654510498, 1.2501513957977295, 1.198542594909668, 1.318580150604248, 1.4413988590240479, 0.897918701171875, 1.5312049388885498, 1.6149518489837646, 1.730602741241455, 1.462773084640503], [1.250871181488037, 1.1807141304016113, 1.215240240097046, 1.2262086868286133, 1.4775090217590332, 1.4240727424621582, 1.5255200862884521, 1.4714293479919434, 1.4581122398376465, 1.7902839183807373], [1.1481730937957764, 1.149869680404663, 1.193610429763794, 1.2128643989562988, 1.626708984375, 1.4477057456970215, 1.6940038204193115, 1.55824613571167, 1.4775633811950684, 1.4731359481811523], [1.2555725574493408, 1.1879363059997559, 1.1711595058441162, 1.2206971645355225, 1.4988980293273926, 1.6178021430969238, 1.5418500900268555, 1.6431937217712402, 1.7161891460418701, 1.4592969417572021], [1.1458919048309326, 1.2375679016113281, 1.2251486778259277, 1.337031602859497, 1.4985735416412354, 1.487335443496704, 1.5360760688781738, 1.4594507217407227, 1.4436471462249756, 1.7763066291809082], [1.170149803161621, 1.1648261547088623, 1.1814446449279785, 1.2201502323150635, 1.6648082733154297, 1.506986379623413, 1.68345046043396, 1.53629469871521, 1.453193187713623, 1.4645459651947021], [1.161055564880371, 1.1683313846588135, 1.152022123336792, 1.2208573818206787, 1.4991064071655273, 1.6515483856201172, 1.5289747714996338, 1.7006621360778809, 1.7573215961456299, 1.443864107131958], [1.2363193035125732, 1.195756196975708, 1.1504690647125244, 1.2116303443908691, 1.5198278427124023, 1.5252959728240967, 1.5487706661224365, 1.4891257286071777, 1.4517338275909424, 1.8063628673553467], [1.1484436988830566, 1.2334682941436768, 1.2540223598480225, 1.3592185974121094, 1.7376396656036377, 1.5214753150939941, 1.7134175300598145, 1.5510823726654053, 1.4878549575805664, 1.4489645957946777], [1.1603236198425293, 1.1690459251403809, 1.1647274494171143, 1.255235195159912, 1.5638701915740967, 1.678006649017334, 1.5491797924041748, 1.7005584239959717, 1.7378168106079102, 1.4621970653533936], [1.169445514678955, 1.1713225841522217, 1.1591675281524658, 1.2255630493164062, 1.523827314376831, 1.5371801853179932, 1.5389938354492188, 1.4700438976287842, 1.4481306076049805, 1.7851755619049072], [1.2551414966583252, 1.3229084014892578, 1.1578631401062012, 1.2442293167114258, 1.6937294006347656, 1.5546858310699463, 1.6896028518676758, 1.5498394966125488, 1.4672679901123047, 1.4608359336853027], [1.152817964553833, 1.1703336238861084, 1.27754807472229, 1.338491678237915, 1.5762970447540283, 1.7114276885986328, 1.5656392574310303, 1.6808483600616455, 1.7546427249908447, 1.4333834648132324], [1.1430962085723877, 1.16579008102417, 1.1647660732269287, 1.219987392425537, 1.5958056449890137, 1.5543348789215088, 1.551361322402954, 1.4572117328643799, 1.4378244876861572, 1.8044652938842773], [1.1668789386749268, 1.184168815612793, 1.1680281162261963, 1.2591490745544434, 2.0557398796081543, 1.552152395248413, 2.7094080448150635, 1.547790288925171, 1.468418836593628, 1.4325335025787354], [1.2531659603118896, 1.2851366996765137, 1.1930081844329834, 1.348005771636963, 1.5243091583251953, 1.7105796337127686, 1.6440248489379883, 1.71148681640625, 1.7748887538909912, 1.4488697052001953], 
[1.151658058166504, 1.1704447269439697, 1.2432365417480469, 1.2329230308532715, 1.557175874710083, 1.5494165420532227, 1.714686632156372, 1.446709156036377, 1.4582014083862305, 1.7860472202301025], [1.1433050632476807, 1.1783008575439453, 1.1691029071807861, 1.2420389652252197, 1.7181065082550049, 1.5225696563720703, 1.5574634075164795, 1.5395457744598389, 1.4630968570709229, 1.4988434314727783], [1.1419775485992432, 1.1561479568481445, 1.1412553787231445, 1.2401368618011475, 1.5427289009094238, 1.7117087841033936, 1.571164846420288, 1.7170703411102295, 1.7325387001037598, 1.4898872375488281], [1.225020408630371, 1.2343361377716064, 1.149345874786377, 1.3731367588043213, 1.60211181640625, 1.5767159461975098, 1.699559211730957, 1.4523937702178955, 1.461254358291626, 1.7874109745025635], [1.1480615139007568, 1.1627068519592285, 1.2455949783325195, 1.2405905723571777, 1.8480234146118164, 1.538923978805542, 1.5232410430908203, 1.5562183856964111, 1.4404094219207764, 1.4579038619995117], [1.1562929153442383, 1.1501624584197998, 1.16416335105896, 1.2591450214385986, 1.5555830001831055, 1.7149100303649902, 1.5590837001800537, 1.7103869915008545, 1.7469820976257324, 1.437847375869751], [1.1793031692504883, 1.1652917861938477, 1.1683034896850586, 1.2422852516174316, 1.6748807430267334, 1.5173194408416748, 1.711012363433838, 1.4492123126983643, 1.4619638919830322, 1.775136947631836], [1.2805955410003662, 1.2572426795959473, 1.1537797451019287, 1.3532154560089111, 1.5400567054748535, 1.5097684860229492, 1.5417547225952148, 1.53941011428833, 1.450190544128418, 1.4475431442260742], [1.157534122467041, 1.1803035736083984, 1.2369754314422607, 1.2677075862884521, 1.5450820922851562, 1.7036025524139404, 0.6882262229919434, 1.710188627243042, 1.7333290576934814, 1.4443767070770264], [1.1265547275543213, 1.1704246997833252, 1.1769335269927979, 1.2623610496520996, 1.6960020065307617, 1.540281057357788, 1.6599838733673096, 1.4356698989868164, 1.4416587352752686, 1.7666807174682617], [1.1335227489471436, 1.1612064838409424, 1.1556897163391113, 1.408433198928833, 1.538316249847412, 1.5693399906158447, 1.536451816558838, 1.5371379852294922, 1.453392744064331, 1.4623374938964844], [1.285693883895874, 1.2546677589416504, 1.1539463996887207, 1.256547451019287, 1.5882375240325928, 1.862346887588501, 1.521723985671997, 1.715195655822754, 1.7473888397216797, 1.4738514423370361], [1.1446588039398193, 1.1569123268127441, 1.2329235076904297, 1.2632663249969482, 1.7034540176391602, 1.524308443069458, 1.6633501052856445, 1.47367525100708, 1.4593265056610107, 1.7812180519104004], [1.1504313945770264, 1.141777515411377, 1.170379638671875, 1.268195390701294, 1.537397861480713, 1.6817348003387451, 1.5278887748718262, 1.5772335529327393, 1.4651505947113037, 1.4405417442321777], [1.1710612773895264, 1.164745569229126, 1.1483139991760254, 1.4107019901275635, 1.5344295501708984, 1.7228834629058838, 1.5348155498504639, 1.6779301166534424, 1.7648346424102783, 1.4387423992156982], [1.2376830577850342, 1.2563304901123047, 1.1630017757415771, 1.2736756801605225, 1.6861703395843506, 1.5228805541992188, 1.6795220375061035, 1.4457073211669922, 1.2864062786102295, 1.786088228225708], [1.1520335674285889, 1.1610140800476074, 1.2273387908935547, 1.267329216003418, 1.5488195419311523, 1.6886730194091797, 1.6110868453979492, 1.5622801780700684, 1.4512395858764648, 1.4393715858459473], [1.1533503532409668, 1.161116361618042, 1.150305986404419, 1.3132901191711426, 1.5252234935760498, 1.5022480487823486, 1.5273082256317139, 1.6844916343688965, 
1.763162612915039, 1.4698023796081543], [1.153672695159912, 1.1605370044708252, 1.1644032001495361, 1.3919858932495117, 1.7254905700683594, 1.5379528999328613, 1.6572132110595703, 1.4428343772888184, 1.4425265789031982, 1.749255657196045], [1.2277257442474365, 1.2384240627288818, 1.160149335861206, 1.2875621318817139, 1.5430457592010498, 1.7025625705718994, 1.5292880535125732, 1.5371849536895752, 1.4502601623535156, 1.4391875267028809], [1.1742603778839111, 1.1643037796020508, 1.2412223815917969, 1.2839560508728027, 1.5621919631958008, 1.5273938179016113, 1.927170753479004, 1.697014570236206, 1.7590489387512207, 1.458855152130127], [1.1504285335540771, 1.1894078254699707, 1.1643459796905518, 1.4153988361358643, 1.7309823036193848, 1.5414600372314453, 2.200521469116211, 1.436112403869629, 1.4668493270874023, 1.7772431373596191], [1.148618221282959, 1.1740739345550537, 1.1655209064483643, 1.2869923114776611, 1.5375473499298096, 1.6791660785675049, 1.8265588283538818, 1.518038272857666, 1.4491069316864014, 1.453784704208374], [1.2641174793243408, 1.2415506839752197, 1.1570849418640137, 1.2792956829071045, 1.534475564956665, 1.5273826122283936, 1.7116730213165283, 1.7140827178955078, 1.7493479251861572, 1.4329674243927002], [1.1609525680541992, 1.154078722000122, 1.231924057006836, 1.290755033493042, 1.7266929149627686, 1.5556564331054688, 1.7109591960906982, 1.4574720859527588, 1.439110517501831, 1.7539892196655273], [1.1523950099945068, 1.1531918048858643, 1.1632575988769531, 1.446091890335083, 1.5366156101226807, 1.7037551403045654, 1.5487394332885742, 1.506514072418213, 1.4557545185089111, 1.428795337677002], [1.1643908023834229, 1.1644489765167236, 1.161515712738037, 1.322934865951538, 1.5728802680969238, 1.5694315433502197, 1.6852331161499023, 1.739811658859253, 1.7595891952514648, 1.4308624267578125], [1.2512500286102295, 1.252448558807373, 1.1516621112823486, 1.3299040794372559, 1.687631368637085, 1.5254008769989014, 1.5508744716644287, 1.4423019886016846, 1.434483528137207, 1.763305425643921], [1.1397411823272705, 1.1585354804992676, 1.2280333042144775, 1.4594085216522217, 1.527611255645752, 1.6858434677124023, 1.5562167167663574, 1.4928946495056152, 1.4651436805725098, 1.4426376819610596], [1.161592721939087, 1.1699330806732178, 1.1702282428741455, 1.3225398063659668, 4.868493318557739, 1.5250239372253418, 1.685441017150879, 1.7382864952087402, 1.7777440547943115, 1.4397156238555908], [1.1462302207946777, 1.1543664932250977, 1.1421773433685303, 1.293567180633545, 2.015699625015259, 1.2328815460205078, 1.5279016494750977, 1.44801926612854, 1.4628791809082031, 1.7633447647094727], [1.2403466701507568, 1.2521452903747559, 1.1432595252990723, 1.3217017650604248, 1.8642125129699707, 1.6736559867858887, 1.6668527126312256, 1.4835896492004395, 1.482419729232788, 1.446296215057373], [1.141000509262085, 1.1490449905395508, 1.2456943988800049, 1.4778857231140137, 1.5265870094299316, 1.4999725818634033, 1.7197678089141846, 1.712496280670166, 1.7754862308502197, 1.462911605834961], [1.169151782989502, 1.1554396152496338, 1.1574056148529053, 1.342790126800537, 1.6973025798797607, 1.5217700004577637, 1.5475718975067139, 1.449101448059082, 1.4489021301269531, 1.7529709339141846], [1.1678287982940674, 1.1528406143188477, 1.1570630073547363, 1.3374569416046143, 1.7375056743621826, 1.6754939556121826, 1.5628376007080078, 1.4574739933013916, 1.47465181350708, 1.450639009475708], [1.2470993995666504, 1.2400691509246826, 1.155475378036499, 1.4937102794647217, 1.5246086120605469, 1.5226118564605713, 
1.707594871520996, 1.7566969394683838, 1.7668135166168213, 1.4750447273254395], [1.1407537460327148, 1.16121506690979, 1.2261757850646973, 1.351496696472168, 1.7321867942810059, 1.5180318355560303, 1.6076266765594482, 1.456942081451416, 1.4580669403076172, 1.7644975185394287], [1.1775963306427002, 1.146338939666748, 1.1535961627960205, 1.338597059249878, 1.5468683242797852, 1.6839284896850586, 1.749560832977295, 1.442091464996338, 1.4541277885437012, 1.4636225700378418], [1.1644313335418701, 1.1767044067382812, 1.1500396728515625, 1.4688193798065186, 1.542895793914795, 1.5138752460479736, 1.6364014148712158, 1.7558505535125732, 1.7550549507141113, 1.4368658065795898], [1.2265660762786865, 1.2404680252075195, 1.1494739055633545, 1.3964338302612305, 1.6792094707489014, 1.53438401222229, 1.5091216564178467, 1.4835970401763916, 1.4416460990905762, 1.7368621826171875], [1.136683702468872, 1.1477711200714111, 1.248060703277588, 1.3586044311523438, 1.529493808746338, 1.6636786460876465, 1.4954946041107178, 1.4574198722839355, 1.4445312023162842, 1.4844191074371338]]} \ No newline at end of file diff --git a/benchmarks/gym-cartpole/ppo.png b/benchmarks/gym-cartpole/ppo.png new file mode 100644 index 000000000..7642512f3 Binary files /dev/null and b/benchmarks/gym-cartpole/ppo.png differ diff --git a/data/active_flow_control.gif b/data/active_flow_control.gif new file mode 100644 index 000000000..284bc709f Binary files /dev/null and b/data/active_flow_control.gif differ diff --git a/data/adaptive_behavior_generation_for_autonomous_driving.png b/data/adaptive_behavior_generation_for_autonomous_driving.png new file mode 100644 index 000000000..5b842bb35 Binary files /dev/null and b/data/adaptive_behavior_generation_for_autonomous_driving.png differ diff --git a/data/bitcoin_trading_bot.png b/data/bitcoin_trading_bot.png new file mode 100644 index 000000000..1f59f93b3 Binary files /dev/null and b/data/bitcoin_trading_bot.png differ diff --git a/data/deepcrawl.gif b/data/deepcrawl.gif new file mode 100644 index 000000000..d1649c806 Binary files /dev/null and b/data/deepcrawl.gif differ diff --git a/data/navbot.gif b/data/navbot.gif new file mode 100644 index 000000000..cbbaffbdd Binary files /dev/null and b/data/navbot.gif differ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..5c2840c53 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Tensorforce +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/agents/a2c.rst b/docs/agents/a2c.rst new file mode 100644 index 000000000..4df270b78 --- /dev/null +++ b/docs/agents/a2c.rst @@ -0,0 +1,4 @@ +Advantage Actor-Critic +====================== + +.. autoclass:: tensorforce.agents.AdvantageActorCritic diff --git a/docs/agents/ac.rst b/docs/agents/ac.rst new file mode 100644 index 000000000..f1fea39b8 --- /dev/null +++ b/docs/agents/ac.rst @@ -0,0 +1,4 @@ +Actor-Critic +============ + +.. 
autoclass:: tensorforce.agents.ActorCritic diff --git a/docs/agents/agent.rst b/docs/agents/agent.rst new file mode 100644 index 000000000..dcc0bf8aa --- /dev/null +++ b/docs/agents/agent.rst @@ -0,0 +1,48 @@ +General agent interface +======================= + +Initialization and termination +------------------------------ + +.. automethod:: tensorforce.agents.TensorforceAgent.create +.. automethod:: tensorforce.agents.TensorforceAgent.reset +.. automethod:: tensorforce.agents.TensorforceAgent.close + +Reinforcement learning interface +-------------------------------- + +.. automethod:: tensorforce.agents.TensorforceAgent.act +.. automethod:: tensorforce.agents.TensorforceAgent.observe + +Get initial internals (for independent-act) +------------------------------------------- + +.. automethod:: tensorforce.agents.TensorforceAgent.initial_internals + +Experience - update interface +----------------------------- + +.. automethod:: tensorforce.agents.TensorforceAgent.experience +.. automethod:: tensorforce.agents.TensorforceAgent.update + +Pretraining +----------- + +.. automethod:: tensorforce.agents.TensorforceAgent.pretrain + +Loading and saving +------------------ + +.. automethod:: tensorforce.agents.TensorforceAgent.load +.. automethod:: tensorforce.agents.TensorforceAgent.save + +Tensor value tracking +--------------------- + +.. automethod:: tensorforce.agents.TensorforceAgent.tracked_tensors + +Specification and architecture +------------------------------ + +.. automethod:: tensorforce.agents.TensorforceAgent.get_specification +.. automethod:: tensorforce.agents.TensorforceAgent.get_architecture diff --git a/docs/agents/constant.rst b/docs/agents/constant.rst new file mode 100644 index 000000000..263795ed5 --- /dev/null +++ b/docs/agents/constant.rst @@ -0,0 +1,4 @@ +Constant Agent +============== + +.. autoclass:: tensorforce.agents.ConstantAgent diff --git a/docs/agents/double_dqn.rst b/docs/agents/double_dqn.rst new file mode 100644 index 000000000..1846c4187 --- /dev/null +++ b/docs/agents/double_dqn.rst @@ -0,0 +1,4 @@ +Double DQN +=========== + +.. autoclass:: tensorforce.agents.DoubleDQN diff --git a/docs/agents/dpg.rst b/docs/agents/dpg.rst new file mode 100644 index 000000000..007f4767d --- /dev/null +++ b/docs/agents/dpg.rst @@ -0,0 +1,4 @@ +Deterministic Policy Gradient +============================= + +.. autoclass:: tensorforce.agents.DeterministicPolicyGradient diff --git a/docs/agents/dqn.rst b/docs/agents/dqn.rst new file mode 100644 index 000000000..e3552e084 --- /dev/null +++ b/docs/agents/dqn.rst @@ -0,0 +1,4 @@ +Deep Q-Network +============== + +.. autoclass:: tensorforce.agents.DeepQNetwork diff --git a/docs/agents/dueling_dqn.rst b/docs/agents/dueling_dqn.rst new file mode 100644 index 000000000..a8387a0cf --- /dev/null +++ b/docs/agents/dueling_dqn.rst @@ -0,0 +1,4 @@ +Dueling DQN +=========== + +.. autoclass:: tensorforce.agents.DuelingDQN diff --git a/docs/agents/ppo.rst b/docs/agents/ppo.rst new file mode 100644 index 000000000..b00b5f9fe --- /dev/null +++ b/docs/agents/ppo.rst @@ -0,0 +1,4 @@ +Proximal Policy Optimization +============================ + +.. autoclass:: tensorforce.agents.ProximalPolicyOptimization diff --git a/docs/agents/random.rst b/docs/agents/random.rst new file mode 100644 index 000000000..dbe7b5559 --- /dev/null +++ b/docs/agents/random.rst @@ -0,0 +1,4 @@ +Random Agent +============ + +.. 
autoclass:: tensorforce.agents.RandomAgent diff --git a/docs/agents/tensorforce.rst b/docs/agents/tensorforce.rst new file mode 100644 index 000000000..4275306c9 --- /dev/null +++ b/docs/agents/tensorforce.rst @@ -0,0 +1,4 @@ +Tensorforce Agent +================= + +.. autoclass:: tensorforce.agents.TensorforceAgent diff --git a/docs/agents/trpo.rst b/docs/agents/trpo.rst new file mode 100644 index 000000000..5cadaa9a2 --- /dev/null +++ b/docs/agents/trpo.rst @@ -0,0 +1,4 @@ +Trust-Region Policy Optimization +================================ + +.. autoclass:: tensorforce.agents.TrustRegionPolicyOptimization diff --git a/docs/agents/vpg.rst b/docs/agents/vpg.rst new file mode 100644 index 000000000..0fa21c526 --- /dev/null +++ b/docs/agents/vpg.rst @@ -0,0 +1,4 @@ +Vanilla Policy Gradient +======================= + +.. autoclass:: tensorforce.agents.VanillaPolicyGradient diff --git a/docs/agents_models.md b/docs/agents_models.md deleted file mode 100644 index 5e6c061c8..000000000 --- a/docs/agents_models.md +++ /dev/null @@ -1,145 +0,0 @@ -Agent and model overview -======================== - -A reinforcement learning agent provides methods to process states and -return actions, to store past observations, and to load and save models. -Most agents employ a `Model` which implements the algorithms to -calculate the next action given the current state and to update model -parameters from past experiences. - -> Environment <-> Runner <-> Agent <-> Model - -Parameters to the agent are passed as a `Configuration` object. The -configuration is passed on to the `Model`. - -Ready-to-use algorithms ------------------------ - -We implemented some of the most common RL algorithms and try to keep -these up-to-date. Here we provide an overview over all implemented -agents and models. - -### Agent / General parameters - -`Agent` is the base class for all reinforcement learning agents. Every -agent inherits from this class. - -```eval_rst - .. autoclass:: tensorforce.agents.Agent - :noindex: - :show-inheritance: - :members: -``` - -### Model - -The `Model` class is the base class for reinforcement learning models. - -```eval_rst - .. autoclass:: tensorforce.models.Model - :noindex: - :show-inheritance: - :members: -``` - - -### MemoryAgent - - -```eval_rst - .. autoclass:: tensorforce.agents.MemoryAgent - :noindex: - :show-inheritance: - :members: -``` - - -### BatchAgent - - -```eval_rst - .. autoclass:: tensorforce.agents.BatchAgent - :noindex: - :show-inheritance: - :members: -``` - - -### Deep-Q-Networks (DQN) - -```eval_rst - .. autoclass:: tensorforce.agents.DQNAgent - :noindex: - :show-inheritance: - :members: -``` - - -### Normalized Advantage Functions - - -```eval_rst - .. autoclass:: tensorforce.agents.NAFAgent - :noindex: - :show-inheritance: - :members: -``` - -### Deep-Q-learning from demonstration (DQFD) - -```eval_rst - .. autoclass:: tensorforce.agents.DQFDAgent - :noindex: - :show-inheritance: - :members: -``` - -### Vanilla Policy Gradient - - -```eval_rst - .. autoclass:: tensorforce.agents.VPGAgent - :noindex: - :show-inheritance: - :members: -``` - -### Trust Region Policy Optimization (TRPO) - - -```eval_rst - .. autoclass:: tensorforce.agents.TRPOAgent - :noindex: - :show-inheritance: - :members: -``` - -State preprocessing -------------------- - -The agent handles state preprocessing. A preprocessor takes the raw state input -from the environment and modifies it (for instance, image resize, state -concatenation, etc.). 
You can find information about our ready-to-use -preprocessors [here](preprocessing.html). - - -Building your own agent ------------------------ - -If you want to build your own agent, it should always inherit from -`Agent`. If your agent uses a replay memory, it should probably inherit -from `MemoryAgent`, if it uses a batch replay that is emptied after each update, -it should probably inherit from `BatchAgent`. - -We distinguish between agents and models. The `Agent` class handles the -interaction with the environment, such as state preprocessing, exploration -and observation of rewards. The `Model` class handles the mathematical -operations, such as building the tensorflow operations, calculating the -desired action and updating (i.e. optimizing) the model weights. - -To start building your own agent, please refer to -[this blogpost](https://reinforce.io) to gain a deeper understanding of the -internals of the TensorForce library. Afterwards, have look on a sample -implementation, e.g. the [DQN Agent](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/agents/dqn_agent.py) -and [DQN Model](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/models/q_model.py). - diff --git a/docs/basics/agent-specification.md b/docs/basics/agent-specification.md new file mode 100644 index 000000000..1df13bd42 --- /dev/null +++ b/docs/basics/agent-specification.md @@ -0,0 +1,68 @@ +Agent specification +=================== + +Agents are instantiated via `Agent.create(agent=...)`, with either of the specification alternatives presented below (`agent` acts as `type` argument). It is recommended to pass as second argument `environment` the application `Environment` implementation, which automatically extracts the corresponding `states`, `actions` and `max_episode_timesteps` arguments of the agent. + + + +### States and actions specification + +A state/action value is specified as dictionary with mandatory attributes `type` (one of `'bool'`: binary, `'int'`: discrete, or `'float'`: continuous) and `shape` (a positive number or tuple thereof). Moreover, `'int'` values should additionally specify `num_values` (the fixed number of discrete options), whereas `'float'` values can specify bounds via `min/max_value`. If the state or action consists of multiple components, these are specified via an additional dictionary layer. The following example illustrates both possibilities: + +```python +states = dict( + observation=dict(type='float', shape=(16, 16, 3)), + attributes=dict(type='int', shape=(4, 2), num_values=5) +) +actions = dict(type='float', shape=10) +``` + +Note: Ideally, the agent arguments `states` and `actions` are specified implicitly by passing the `environment` argument. + + + +### How to specify modules + +##### Dictionary with module type and arguments +```python +Agent.create(... + policy=dict(network=dict(type='layered', layers=[dict(type='dense', size=32)])), + memory=dict(type='replay', capacity=10000), ... +) +``` + + +##### JSON specification file (plus additional arguments) +```python +Agent.create(... + policy=dict(network='network.json'), + memory=dict(type='memory.json', capacity=10000), ... +) +``` + + +##### Module path (plus additional arguments) +```python +Agent.create(... + policy=dict(network='my_module'), + memory=dict(type='tensorforce.core.memories.Replay', capacity=10000), ... +) +``` + + +##### Callable or Type (plus additional arguments) +```python +Agent.create(... 
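+    # assumes these class objects are imported, e.g. a user-defined TestNetwork class and: from tensorforce.core.memories import Replay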
+ policy=dict(network=TestNetwork), + memory=dict(type=Replay, capacity=10000), ... +) +``` + + +##### Default module: only arguments or first argument +```python +Agent.create(... + policy=dict(network=[dict(type='dense', size=32)]), + memory=dict(capacity=10000), ... +) +``` diff --git a/docs/basics/features.md b/docs/basics/features.md new file mode 100644 index 000000000..7ef571a0a --- /dev/null +++ b/docs/basics/features.md @@ -0,0 +1,157 @@ +Features +======== + + +### Multi-input and non-sequential network architectures + +See [networks documentation](../modules/networks.html). + + + +### Abort-terminal due to timestep limit + +Besides `terminal=False` or `=0` for non-terminal and `terminal=True` or `=1` for true terminal, Tensorforce recognizes `terminal=2` as abort-terminal and handles it accordingly for reward estimation. Environments created via `Environment.create(..., max_episode_timesteps=?, ...)` will automatically return the appropriate terminal depending on whether an episode truly terminates or is aborted because it reached the time limit. + + + +### Action masking + +See also the [action-masking example](https://github.com/tensorforce/tensorforce/blob/master/examples/action_masking.py) for an environment implementation with built-in action masking. + +```python +agent = Agent.create( + states=dict(type='float', shape=(10,)), + actions=dict(type='int', shape=(), num_values=3), + ... +) +... +states = dict( + state=np.random.random_sample(size=(10,)), # state (default name: "state") + action_mask=[True, False, True] # mask as '[ACTION-NAME]_mask' (default name: "action") +) +action = agent.act(states=states) +assert action != 1 +``` + + + +### Parallel environment execution + +See also the [parallelization example](https://github.com/tensorforce/tensorforce/blob/master/examples/parallelization.py) for details on how to use this feature. + +Execute multiple environments running locally in one call / batched: + +```python +runner = Runner( + agent='benchmarks/configs/ppo1.json', environment='CartPole-v1', + num_parallel=4 +) +runner.run(num_episodes=100, batch_agent_calls=True) +``` + +Execute environments running in different processes whenever ready / unbatched: + +```python +runner = Runner( + agent='benchmarks/configs/ppo1.json', environment='CartPole-v1', + num_parallel=4, remote='multiprocessing' +) +runner.run(num_episodes=100) +``` + +Execute environments running on different machines, here using `run.py` instead +of `Runner`: + +```bash +# Environment machine 1 +python run.py --environment gym --level CartPole-v1 --remote socket-server \ + --port 65432 + +# Environment machine 2 +python run.py --environment gym --level CartPole-v1 --remote socket-server \ + --port 65433 + +# Agent machine +python run.py --agent benchmarks/configs/ppo1.json --episodes 100 \ + --num-parallel 2 --remote socket-client --host 127.0.0.1,127.0.0.1 \ + --port 65432,65433 --batch-agent-calls +``` + + + +### Vectorized environment + +See the [vectorized environment example](https://github.com/tensorforce/tensorforce/blob/master/examples/vectorized_environment.py) for details on how to use this feature. + + + +### Multi-actor environment + +See the [multi-actor environment example](https://github.com/tensorforce/tensorforce/blob/master/examples/multiactor_environment.py) for details on how to use this feature. + + + +### Save & restore + +##### TensorFlow saver (full model) + +```python +agent = Agent.create(...
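+    # saver argument: enables periodic TensorFlow checkpointing of the full agent model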
+ saver=dict( + directory='data/checkpoints', + frequency=100 # save checkpoint every 100 updates + ), ... +) +... +agent.close() + +# Restore latest agent checkpoint +agent = Agent.load(directory='data/checkpoints') +``` + +See also the [save-load example](https://github.com/tensorforce/tensorforce/blob/master/examples/save_load_agent.py). + + +##### NumPy / HDF5 (only weights) + +```python +agent = Agent.create(...) +... +agent.save(directory='data/checkpoints', format='numpy', append='episodes') + +# Restore latest agent checkpoint +agent = Agent.load(directory='data/checkpoints', format='numpy') +``` + +See also the [save-load example](https://github.com/tensorforce/tensorforce/blob/master/examples/save_load_agent.py). + + +##### SavedModel export + +See the [SavedModel example](https://github.com/tensorforce/tensorforce/blob/master/examples/export_saved_model.py) for details on how to use this feature. + + + +### TensorBoard + +```python +Agent.create(... + summarizer=dict( + directory='data/summaries', + # list of labels, or 'all' + labels=['entropy', 'kl-divergence', 'loss', 'reward', 'update-norm'] + ), ... +) +``` + + + +### Act-experience-update interaction + +Instead of the default act-observe interaction pattern or the [Runner utility](../execution/runner.html), one can alternatively use the act-experience-update interface, which allows for more control over the experience the agent stores. See the [act-experience-update example](https://github.com/tensorforce/tensorforce/blob/master/examples/act_experience_update_interface.py) for details on how to use this feature. Note that a few stateful network layers will not be updated correctly in independent-mode (currently, `exponential_normalization`). + + + +### Record & pretrain + +See the [record-and-pretrain example](https://github.com/tensorforce/tensorforce/blob/master/examples/record_and_pretrain.py) for details on how to use this feature. diff --git a/docs/basics/getting-started.md b/docs/basics/getting-started.md new file mode 100644 index 000000000..f10d80873 --- /dev/null +++ b/docs/basics/getting-started.md @@ -0,0 +1,274 @@ +Getting started +=============== + + +[Quickstart example](https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py) + + +### Initializing an environment + +It is recommended to initialize an environment via the `Environment.create(...)` [interface](../environments/environment.html). + +```python +from tensorforce.environments import Environment +``` + +For instance, the [OpenAI CartPole environment](../environments/openai_gym.html) can be initialized as follows (see environment docs for available environments and arguments): + +```python +environment = Environment.create( + environment='gym', level='CartPole', max_episode_timesteps=500 +) +``` + +Gym's pre-defined versions are also accessible: + +```python +environment = Environment.create(environment='gym', level='CartPole-v1') +``` + +Alternatively, an environment can be specified as a config file: + +```json +{ + "environment": "gym", + "level": "CartPole" +} +``` + +Environment config files can be loaded by passing their file path: + +```python +environment = Environment.create( + environment='environment.json', max_episode_timesteps=500 +) +``` + +Custom Gym environments can be used in the same way, but require the corresponding class(es) to be imported and registered accordingly. 
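+
+For illustration, a minimal sketch of such a registration, using Gym's standard `register` utility (the id `CustomGymEnv-v0` and module path `envs.custom_gym_env` are placeholders for the actual custom environment class):
+
+```python
+from gym.envs.registration import register
+
+from tensorforce.environments import Environment
+
+# Register the custom class under a Gym id so it can be referenced by name
+register(id='CustomGymEnv-v0', entry_point='envs.custom_gym_env:CustomGymEnv')
+
+environment = Environment.create(
+    environment='gym', level='CustomGymEnv-v0', max_episode_timesteps=500
+)
+```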
+ +Finally, it is possible to implement a custom environment using Tensorforce's `Environment` interface: + +```python +class CustomEnvironment(Environment): + + def __init__(self): + super().__init__() + + def states(self): + return dict(type='float', shape=(8,)) + + def actions(self): + return dict(type='int', num_values=4) + + # Optional: should only be defined if environment has a natural fixed + # maximum episode length; otherwise specify maximum number of training + # timesteps via Environment.create(..., max_episode_timesteps=???) + def max_episode_timesteps(self): + return super().max_episode_timesteps() + + # Optional additional steps to close environment + def close(self): + super().close() + + def reset(self): + state = np.random.random(size=(8,)) + return state + + def execute(self, actions): + next_state = np.random.random(size=(8,)) + terminal = False # Always False if no "natural" terminal state + reward = np.random.random() + return next_state, terminal, reward +``` + +Custom environment implementations can be loaded by passing either the environment object itself: + +```python +environment = Environment.create( + environment=CustomEnvironment, max_episode_timesteps=100 +) +``` + +or its module path (e.g., assuming the class is defined in file `envs/custom_env.py`): + +```python +environment = Environment.create( + environment='envs.custom_env', max_episode_timesteps=100 +) +``` + +It is generally recommended to specify the `max_episode_timesteps` argument of `Environment.create(...)` (at least for training), as some agent parameters may rely on this value. + + + + +### Initializing an agent + +Similarly to environments, it is recommended to initialize an agent via the `Agent.create(...)` [interface](../agents/agent.html). + +```python +from tensorforce.agents import Agent +``` + +For instance, the [generic Tensorforce agent](../agents/tensorforce.html) can be initialized as follows (see agent docs for available agents and arguments): + +```python +agent = Agent.create( + agent='tensorforce', environment=environment, update=64, + optimizer=dict(optimizer='adam', learning_rate=1e-3), + objective='policy_gradient', reward_estimation=dict(horizon=20) +) +``` + +Other pre-defined agent classes can alternatively be used, for instance, [Proximal Policy Optimization](../agents/ppo.html): + +```python +agent = Agent.create( + agent='ppo', environment=environment, batch_size=10, learning_rate=1e-3 +) +``` + +Alternatively, an agent can be specified as a config file: + +```json +{ + "agent": "tensorforce", + "update": 64, + "optimizer": { + "optimizer": "adam", + "learning_rate": 1e-3 + }, + "objective": "policy_gradient", + "reward_estimation": { + "horizon": 20 + } +} +``` + +Agent config files can be loaded by passing their file path: + +```python +agent = Agent.create(agent='agent.json', environment=environment) +``` + +While it is possible to specify the agent arguments `states`, `actions` and `max_episode_timesteps`, it is generally recommended to specify the `environment` argument instead (which will automatically infer the other values accordingly), by passing the environment object as returned by `Environment.create(...)`. 
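+
+For illustration, a minimal sketch of the explicit alternative (using the PPO arguments from the earlier example and the specification accessors of the `environment` object created above):
+
+```python
+# Explicitly pass the specifications instead of the environment object
+agent = Agent.create(
+    agent='ppo', batch_size=10, learning_rate=1e-3,
+    states=environment.states(), actions=environment.actions(),
+    max_episode_timesteps=environment.max_episode_timesteps()
+)
+```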
+ + + + +### Training and evaluation + +It is recommended to use the execution utilities for training and evaluation, like the [Runner utility](../execution/runner.html), which offer a range of configuration options: + +```python +from tensorforce.execution import Runner +``` + +A basic experiment consisting of training and subsequent evaluation can be written in a few lines of code: + +```python +runner = Runner( + agent='agent.json', + environment=dict(environment='gym', level='CartPole'), + max_episode_timesteps=500 +) + +runner.run(num_episodes=200) + +runner.run(num_episodes=100, evaluation=True) + +runner.close() +``` + +The same interface also makes it possible to run experiments involving multiple parallelized environments: + +```python +runner = Runner( + agent='agent.json', + environment=dict(environment='gym', level='CartPole'), + max_episode_timesteps=500, + num_parallel=5, remote='multiprocessing' +) + +runner.run(num_episodes=100) + +runner.close() +``` + +Note that in this case both agent and environment are created as part of `Runner`, not via `Agent.create(...)` and `Environment.create(...)`. If agent and environment are specified separately, the user is required to take care of passing the agent arguments `environment` and `parallel_interactions` (in the parallelized case) as well as closing both agent and environment separately at the end. + +The execution utility classes take care of handling the agent-environment interaction correctly, and thus should be used where possible. Alternatively, if more detailed control over the agent-environment interaction is required, a simple training loop can be defined as follows, using the act-observe interaction pattern (see also the [act-observe example](https://github.com/tensorforce/tensorforce/blob/master/examples/act_observe_interface.py)): + +```python +# Create agent and environment +environment = Environment.create( + environment='environment.json', max_episode_timesteps=500 +) +agent = Agent.create(agent='agent.json', environment=environment) + +# Train for 100 episodes +for _ in range(100): + states = environment.reset() + terminal = False + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) +``` + +Alternatively, the act-experience-update interface offers even more flexibility (see also the [act-experience-update example](https://github.com/tensorforce/tensorforce/blob/master/examples/act_experience_update_interface.py)), however, note that a few stateful network layers will not be updated correctly in independent-mode (currently, `exponential_normalization`): + +```python +# Train for 100 episodes +for _ in range(100): + episode_states = list() + episode_internals = list() + episode_actions = list() + episode_terminal = list() + episode_reward = list() + + states = environment.reset() + internals = agent.initial_internals() + terminal = False + while not terminal: + episode_states.append(states) + episode_internals.append(internals) + actions, internals = agent.act( + states=states, internals=internals, independent=True + ) + episode_actions.append(actions) + states, terminal, reward = environment.execute(actions=actions) + episode_terminal.append(terminal) + episode_reward.append(reward) + + agent.experience( + states=episode_states, internals=episode_internals, + actions=episode_actions, terminal=episode_terminal, + reward=episode_reward + ) + agent.update() +``` + +Finally, the evaluation loop can 
be defined as follows: + +```python +# Evaluate for 100 episodes +sum_rewards = 0.0 +for _ in range(100): + states = environment.reset() + internals = agent.initial_internals() + terminal = False + while not terminal: + actions, internals = agent.act( + states=states, internals=internals, + independent=True, deterministic=True + ) + states, terminal, reward = environment.execute(actions=actions) + sum_rewards += reward + +print('Mean episode reward:', sum_rewards / 100) + +# Close agent and environment +agent.close() +environment.close() +``` diff --git a/docs/basics/installation.md b/docs/basics/installation.md new file mode 100644 index 000000000..7075633ff --- /dev/null +++ b/docs/basics/installation.md @@ -0,0 +1,60 @@ +Installation +============ + + +A stable version of Tensorforce is periodically updated on PyPI and installed as follows: + +```bash +pip3 install tensorforce +``` + +To always use the latest version of Tensorforce, install the GitHub version instead: + +```bash +git clone https://github.com/tensorforce/tensorforce.git +cd tensorforce +pip3 install -e . +``` + +Environments require additional packages for which there are setup options available (`ale`, `gym`, `retro`, `vizdoom`, `carla`; or `envs` for all environments), however, some require additional tools to be installed separately (see [environments documentation](http://tensorforce.readthedocs.io)). Other setup options include `tfa` for [TensorFlow Addons](https://www.tensorflow.org/addons) and `tune` for [HpBandSter](https://github.com/automl/HpBandSter) required for the `tune.py` script. + + +**Note on GPU usage:** Different from (un)supervised deep learning, RL does not always benefit from running on a GPU, depending on environment and agent configuration. In particular for RL-typical environments with low-dimensional state spaces (i.e., no images), one usually gets better performance by running on CPU only. Consequently, Tensorforce is configured to run on CPU by default, which can be changed via the agent's `config` argument, for instance, `config=dict(device='GPU')`. + + +**M1 Macs** + +At the moment Tensorflow cannot be installed on M1 Macs directly. You need to follow [Apple's guide](https://developer.apple.com/metal/tensorflow-plugin/) to install `tensorflow-macos` instead. + +Then, since Tensorforce has `tensorflow` as its dependency and not `tensorflow-macos`, you need to install all Tensorforce's dependencies from [requirements.txt](https://github.com/tensorforce/tensorforce/blob/master/requirements.txt) manually (except for `tensorflow == 2.5.0` of course). + +In the end, install tensorforce while forcing pip to ignore its dependencies: +``` +pip3 install tensorforce --no-deps +``` + + +**Dockerfile** + +If you want to use Tensorforce within a Docker container, the following is a minimal `Dockerfile` to get started: + +``` +FROM python:3.8 +RUN \ + pip3 install tensorforce +``` + +Or alternatively for the latest version: + +``` +FROM python:3.8 +RUN \ + git clone https://github.com/tensorforce/tensorforce.git && \ + pip3 install -e tensorforce +``` + +Subsequently, the container can be built via: + +```bash +docker build . +``` diff --git a/docs/basics/run.md b/docs/basics/run.md new file mode 100644 index 000000000..2f6bc233a --- /dev/null +++ b/docs/basics/run.md @@ -0,0 +1,70 @@ +run.py -- Runner +================ + + +###### Agent arguments + +**-\-[a]gent** (*string*, **required** *unless "socket-server" remote mode*) -- Agent (name, configuration JSON file, or library module) +
+**-\-[c]heckpoints** (*string, default: not specified*) -- TensorFlow checkpoints directory, plus optional comma-separated filename +
+**-\-[s]ummaries** (*string, default: not specified*) -- TensorBoard summaries directory, plus optional comma-separated filename +
+**-\-recordings** (*string, default: not specified*) -- Traces recordings directory + + +###### Environment arguments + +**-\-[e]nvironment** (*string*, **required** *unless "socket-client" remote mode*) -- Environment (name, configuration JSON file, or library module) +
+**-\-[l]evel** (*string, default: not specified*) -- Level or game id, like `CartPole-v1`, if supported +
+**-\-[m]ax-episode-timesteps** (*int, default: not specified*) -- Maximum number of timesteps per episode +
+**-\-visualize** (*bool, default: false*) -- Visualize agent--environment interaction, if supported +
+**-\-visualize-directory** (*string, default: not specified*) -- Directory to store videos of agent--environment interaction, if supported +
+**-\-import-modules** (*string, default: not specified*) -- Import comma-separated modules required for environment + + +###### Parallel execution arguments + +**-\-num-parallel** (*int, default: no parallel execution*) -- Number of environment instances to execute in parallel +
+**-\-batch-agent-calls** (*bool, default: false*) -- Batch agent calls for parallel environment execution +
+**-\-sync-timesteps** (*bool, default: false*) -- Synchronize parallel environment execution on timestep-level +
+**-\-sync-episodes** (*bool, default: false*) -- Synchronize parallel environment execution on episode-level +
+**-\-remote** (*str, default: local execution*) -- Communication mode for remote execution of parallelized environments: *"multiprocessing"* | *"socket-client"* | *"socket-server"*. In case of *"socket-server"*, runs the environment in a server communication loop until closed. +
+**-\-blocking** (*bool, default: false*) -- Remote environments should be blocking +
+**-\-host** (*str, only for "socket-client" remote mode*) -- Socket server hostname(s) or IP address(es), single value or comma-separated list +
+**-\-port** (*str, only for "socket-client/server" remote mode*) -- Socket server port(s), single value or comma-separated list, increasing sequence if single host and port given + + +###### Runner arguments + +**-\-e[v]aluation** (*bool, default: false*) -- Run environment (last if multiple) in evaluation mode +
+**-\-episodes [n]** (*int, default: not specified*) -- Number of episodes +
+**-\-[t]imesteps** (*int, default: not specified*) -- Number of timesteps +
+**-\-[u]pdates** (*int, default: not specified*) -- Number of agent updates +
+**-\-mean-horizon** (*int, default: 1*) -- Number of episodes over which progress-bar values and the evaluation score are averaged +
+**-\-save-best-agent** (*string, default: not specified*) -- Directory to save the best version of the agent according to the evaluation score + +###### Logging arguments + +**-\-[r]epeat** (*int, default: 1*) -- Number of repetitions +
+**-\-path** (*string, default: not specified*) -- Logging path, directory plus filename without extension + +**-\-seaborn** (*bool, default: false*) -- Use seaborn diff --git a/docs/basics/tune.md b/docs/basics/tune.md new file mode 100644 index 000000000..904aeb7fc --- /dev/null +++ b/docs/basics/tune.md @@ -0,0 +1,37 @@ +tune.py -- Hyperparameter tuner +=============================== + +Uses the [BOHB optimizer (Bayesian Optimization and Hyperband)](https://github.com/automl/HpBandSter) internally. + + +###### Environment arguments + +**-\-[e]nvironment** (*string*, **required**) -- Environment (name, configuration JSON file, or library module) +
+**-\-[l]evel** (*string, default: not specified*) -- Level or game id, like `CartPole-v1`, if supported +
+**-\-[m]ax-episode-timesteps** (*int, default: not specified*) -- Maximum number of timesteps per episode +
+**-\-import-modules** (*string, default: not specified*) -- Import comma-separated modules required for environment + + +###### Runner arguments + +**-\-episodes [n]** (*int*, **required**) -- Number of episodes +
+**-\-num-[p]arallel** (*int, default: no parallel execution*) -- Number of environment instances to execute in parallel + + +##### Tuner arguments + +**-\-[r]uns-per-round** (*string, default: 1,2,5,10*) -- Comma-separated number of runs per optimization round, each with a successively smaller number of candidates +
+**-\-[s]election-factor** (*int, default: 3*) -- Selection factor n, meaning that one out of n candidates in each round advances to the next optimization round +
+**-\-num-[i]terations** (*int, default: 1*) -- Number of optimization iterations, each consisting of a series of optimization rounds with an increasingly reduced candidate pool +
+**-\-[d]irectory** (*string, default: "tuner"*) -- Output directory +
+**-\-restore** (*string, default: not specified*) -- Restore from given directory +
+**-\-id** (*string, default: "worker"*) -- Unique worker id diff --git a/docs/conf.py b/docs/conf.py index de79b7891..200f68963 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,7 +1,8 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # -# TensorForce documentation build configuration file, created by -# sphinx-quickstart on Sun Mar 19 22:09:11 2017. +# Tensorforce documentation build configuration file, created by +# sphinx-quickstart. # # This file is execfile()d with the current directory set to its # containing dir. @@ -17,64 +18,61 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # -# SPHINX_APIDOC_OPTIONS=members,undoc-members,inherited-members,show-inheritance sphinx-apidoc /data/coding/reinforce.io/tensorforce -o tensorforce +# SPHINX_APIDOC_OPTIONS=members,undoc-members,inherited-members,show-inheritance sphinx-apidoc /data/coding/tensorforce/tensorforce -o tensorforce import os import sys -sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(1, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('..')) # import CommonMark -from recommonmark.transform import AutoStructify from m2r import M2R +from recommonmark.transform import AutoStructify + +import tensorforce # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -# # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.githubpages', 'sphinx.ext.autodoc', 'sphinx.ext.napoleon'] +extensions = [ + 'recommonmark', + 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest', 'sphinx.ext.todo', + 'sphinx.ext.coverage', 'sphinx.ext.githubpages', 'sphinx.ext.napoleon' +] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -source_parsers = { - '.md': 'recommonmark.parser.CommonMarkParser', -} - # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] source_suffix = ['.rst', '.md'] # The master toctree document. master_doc = 'index' # General information about the project. -project = u'TensorForce' -copyright = u'2017, reinforce.io' -author = u'reinforce.io' +project = 'Tensorforce' +copyright = '2018, Tensorforce Team' +author = 'Tensorforce Team' -github_doc_root = 'https://github.com/reinforceio/tensorforce/tree/master/docs/' +github_doc_root = 'https://github.com/tensorforce/tensorforce/tree/master/docs/' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -# + # The short X.Y version. -version = u'0.3.3' +version = tensorforce.__version__ # The full version, including alpha/beta/rc tags. -release = u'0.3.3' +release = tensorforce.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -# # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None @@ -90,30 +88,36 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. 
todo_include_todos = False -# autoclass_content = 'both' - -autodoc_mock_imports = ['go_vncdriver', 'tensorflow', 'deepmind_lab', 'universe.spaces', 'gym.spaces.discrete', 'gym.wrappers', - 'mazeexp', 'ale_python_interface', 'msgpack', 'msgpack_numpy', 'cached_property', - 'tensorflow.python.training.adadelta', 'tensorflow.python.training.adagrad', 'tensorflow.python.training.adam', - 'tensorflow.python.training.gradient_descent', 'tensorflow.python.training.momentum', 'tensorflow.python.training.rmsprop', - 'tensorflow.core.util.event_pb2'] - +# Napoleon settings napoleon_google_docstring = True -napoleon_numpy_docstring = False +napoleon_numpy_docstring = True napoleon_include_init_with_doc = False +napoleon_include_private_with_doc = False +napoleon_include_special_with_doc = True +napoleon_use_admonition_for_examples = False +napoleon_use_admonition_for_notes = False +napoleon_use_admonition_for_references = False +napoleon_use_ivar = False +napoleon_use_param = True +napoleon_use_rtype = True + +# TODO !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# autodoc_mock_imports = ['go_vncdriver', 'tensorflow', 'deepmind_lab', 'universe.spaces', 'gym.spaces.discrete', 'gym.wrappers', +# 'mazeexp', 'ale_python_interface', 'msgpack', 'msgpack_numpy', 'cached_property', +# 'tensorflow.python.training.adadelta', 'tensorflow.python.training.adagrad', 'tensorflow.python.training.adam', +# 'tensorflow.python.training.gradient_descent', 'tensorflow.python.training.momentum', 'tensorflow.python.training.rmsprop', +# 'tensorflow.core.util.event_pb2'] # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -# html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -# # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, @@ -125,7 +129,7 @@ # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'TensorForcedoc' +htmlhelp_basename = 'Tensorforcedoc' # -- Options for LaTeX output --------------------------------------------- @@ -152,8 +156,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'TensorForce.tex', u'TensorForce Documentation', - u'reinforce.io', 'manual'), + (master_doc, 'Tensorforce.tex', 'Tensorforce Documentation', 'Tensorforce Team', 'manual') ] @@ -161,10 +164,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'tensorforce', u'TensorForce Documentation', - [author], 1) -] +man_pages = [(master_doc, 'tensorforce', 'Tensorforce Documentation', [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -172,16 +172,20 @@ # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'TensorForce', u'TensorForce Documentation', - author, 'TensorForce', 'One line description of project.', - 'Miscellaneous'), -] +texinfo_documents = [( + master_doc, 'Tensorforce', 'Tensorforce Documentation', author, 'Tensorforce', + 'One line description of project.', 'Miscellaneous' +)] + + +# -- Extension configuration ------------------------------------------------- m2r = M2R() + + def process_docstring(app, what, name, obj, options, lines): """Enable markdown syntax in docstrings""" - + markdown = "\n".join(lines) # ast = cm_parser.parse(markdown) @@ -193,20 +197,13 @@ def process_docstring(app, what, name, obj, options, lines): lines.extend(rest.split("\n")) -# https://stackoverflow.com/a/5599712 -def dont_skip_init(app, what, name, obj, skip, options): - if name == "__init__": - return False - return skip +recommonmark_config = dict( + url_resolver=(lambda url: github_doc_root + url), auto_toc_tree_section='Contents', + enable_eval_rst=True +) def setup(app): - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: url, # lambda url: github_doc_root + url, - 'auto_toc_tree_section': 'Contents', - 'enable_eval_rst': True, - 'enable_auto_doc_ref': True, - }, True) + app.add_config_value('recommonmark_config', recommonmark_config, True) app.add_transform(AutoStructify) app.connect('autodoc-process-docstring', process_docstring) - app.connect("autodoc-skip-member", dont_skip_init) diff --git a/docs/environments.md b/docs/environments.md deleted file mode 100644 index 0e0b9e718..000000000 --- a/docs/environments.md +++ /dev/null @@ -1,59 +0,0 @@ -Environments -============ - -A reinforcement learning environment provides the API to a simulated or real -environment as the subject for optimization. It could be anything from -video games (e.g. Atari) to robots or trading systems. The agent interacts -with this environment and learns to act optimally in its dynamics. - -> Environment <-> Runner <-> Agent <-> Model - -```eval_rst - .. autoclass:: tensorforce.environments.Environment - :members: - :noindex: -``` - - -Ready-to-use environments -------------------------- - -### OpenAI Gym - -```eval_rst - .. autoclass:: tensorforce.contrib.openai_gym.OpenAIGym - :noindex: - :show-inheritance: - :members: - :special-members: __init__ -``` - -### OpenAI Universe - -```eval_rst - .. autoclass:: tensorforce.contrib.openai_universe.OpenAIUniverse - :noindex: - :show-inheritance: - :members: - :special-members: __init__ -``` - -### Deepmind Lab - -```eval_rst - .. autoclass:: tensorforce.contrib.deepmind_lab.DeepMindLab - :noindex: - :show-inheritance: - :members: - :special-members: __init__ -``` - -### Unreal Engine 4 Games - -```eval_rst - .. autoclass:: tensorforce.contrib.unreal_engine.UE4Environment - :noindex: - :show-inheritance: - :members: - :special-members: __init__ -``` diff --git a/docs/environments/ale.rst b/docs/environments/ale.rst new file mode 100644 index 000000000..6e93cb4be --- /dev/null +++ b/docs/environments/ale.rst @@ -0,0 +1,4 @@ +Arcade Learning Environment +=========================== + +.. 
autoclass:: tensorforce.environments.ArcadeLearningEnvironment diff --git a/docs/environments/environment.rst b/docs/environments/environment.rst new file mode 100644 index 000000000..1902762bb --- /dev/null +++ b/docs/environments/environment.rst @@ -0,0 +1,21 @@ +General environment interface +============================= + +Initialization and termination +------------------------------ + +.. automethod:: tensorforce.environments.Environment.create +.. automethod:: tensorforce.environments.Environment.close + +Properties +---------- + +.. automethod:: tensorforce.environments.Environment.states +.. automethod:: tensorforce.environments.Environment.actions +.. automethod:: tensorforce.environments.Environment.max_episode_timesteps + +Interaction functions +--------------------- + +.. automethod:: tensorforce.environments.Environment.reset +.. automethod:: tensorforce.environments.Environment.execute diff --git a/docs/environments/open_sim.rst b/docs/environments/open_sim.rst new file mode 100644 index 000000000..9fa0002bc --- /dev/null +++ b/docs/environments/open_sim.rst @@ -0,0 +1,4 @@ +Open Sim +======== + +.. autoclass:: tensorforce.environments.OpenSim diff --git a/docs/environments/openai_gym.rst b/docs/environments/openai_gym.rst new file mode 100644 index 000000000..2d089748a --- /dev/null +++ b/docs/environments/openai_gym.rst @@ -0,0 +1,4 @@ +OpenAI Gym +========== + +.. autoclass:: tensorforce.environments.OpenAIGym diff --git a/docs/environments/openai_retro.rst b/docs/environments/openai_retro.rst new file mode 100644 index 000000000..d7bad70e9 --- /dev/null +++ b/docs/environments/openai_retro.rst @@ -0,0 +1,4 @@ +OpenAI Retro +============ + +.. autoclass:: tensorforce.environments.OpenAIRetro diff --git a/docs/environments/ple.rst b/docs/environments/ple.rst new file mode 100644 index 000000000..70bae664a --- /dev/null +++ b/docs/environments/ple.rst @@ -0,0 +1,4 @@ +PyGame Learning Environment +=========================== + +.. autoclass:: tensorforce.environments.PyGameLearningEnvironment diff --git a/docs/environments/vizdoom.rst b/docs/environments/vizdoom.rst new file mode 100644 index 000000000..0cfab5ce9 --- /dev/null +++ b/docs/environments/vizdoom.rst @@ -0,0 +1,4 @@ +ViZDoom +======= + +.. autoclass:: tensorforce.environments.ViZDoom diff --git a/docs/execution/runner.rst b/docs/execution/runner.rst new file mode 100644 index 000000000..e0503e8ab --- /dev/null +++ b/docs/execution/runner.rst @@ -0,0 +1,5 @@ +Runner utility +============== + +.. autoclass:: tensorforce.execution.Runner + :members: run diff --git a/docs/index.rst b/docs/index.rst index 7c8f1940b..842e463ad 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,112 +1,77 @@ -*TensorForce - modular deep reinforcement learning in TensorFlow* -================================================================================= - -TensorForce is an open source reinforcement learning library focused on -providing clear APIs, readability and modularisation to deploy -reinforcement learning solutions both in research and practice. -TensorForce is built on top on TensorFlow. - -Quick start ------------ - -For a quick start, you can run one of our example scripts using the -provided configurations, e.g. to run the TRPO agent on CartPole, execute -from the examples folder: - -.. code:: bash - - python examples/openai_gym.py CartPole-v0 -a examples/configs/ppo.json -n examples/configs/mlp2_network.json - - -In python, it could look like this: - -.. 
code:: python - - # examples/quickstart.py - - import numpy as np - - from tensorforce.agents import PPOAgent - from tensorforce.execution import Runner - from tensorforce.contrib.openai_gym import OpenAIGym - - # Create an OpenAIgym environment - env = OpenAIGym('CartPole-v0', visualize=True) - - # Network as list of layers - network_spec = [ - dict(type='dense', size=32, activation='tanh'), - dict(type='dense', size=32, activation='tanh') - ] - - agent = PPOAgent( - states_spec=env.states, - actions_spec=env.actions, - network_spec=network_spec, - batch_size=4096, - # BatchAgent - keep_last_timestep=True, - # PPOAgent - step_optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - optimization_steps=10, - # Model - scope='ppo', - discount=0.99, - # DistributionModel - distributions_spec=None, - entropy_regularization=0.01, - # PGModel - baseline_mode=None, - baseline=None, - baseline_optimizer=None, - gae_lambda=None, - # PGLRModel - likelihood_ratio_clipping=0.2, - summary_spec=None, - distributed_spec=None - ) - - # Create the runner - runner = Runner(agent=agent, environment=env) - - - # Callback function printing episode statistics - def episode_finished(r): - print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.episode_timestep, - reward=r.episode_rewards[-1])) - return True - - - # Start learning - runner.run(episodes=3000, max_episode_timesteps=200, episode_finished=episode_finished) - runner.close() - - # Print statistics - print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format( - ep=runner.episode, - ar=np.mean(runner.episode_rewards[-100:])) - ) +Tensorforce: a TensorFlow library for applied reinforcement learning +==================================================================== + +Tensorforce is an open-source deep reinforcement learning framework, with an emphasis on modularized flexible library design and straightforward usability for applications in research and practice. Tensorforce is built on top of `Google's TensorFlow framework `_ and requires Python 3. + +Tensorforce follows a set of high-level design choices which differentiate it from other similar libraries: + +- **Modular component-based design**: Feature implementations, above all, strive to be as generally applicable and configurable as possible, potentially at some cost of faithfully resembling details of the introducing paper. +- **Separation of RL algorithm and application**: Algorithms are agnostic to the type and structure of inputs (states/observations) and outputs (actions/decisions), as well as the interaction with the application environment. +- **Full-on TensorFlow models**: The entire reinforcement learning logic, including control flow, is implemented in TensorFlow, to enable portable computation graphs independent of application programming language, and to facilitate the deployment of models. + .. toctree:: - :maxdepth: 2 - :caption: Contents: - - agents_models - environments - preprocessing - summary_spec - runner - tensorforce/tensorforce + :maxdepth: 0 + :caption: Basics + basics/installation + basics/getting-started + basics/agent-specification + basics/features + basics/run + basics/tune -More information ----------------- -You can find more information at our `TensorForce GitHub repository `__. +.. 
toctree:: + :maxdepth: 0 + :caption: Agents + + agents/agent + agents/constant + agents/random + agents/tensorforce + agents/vpg + agents/ppo + agents/trpo + agents/dpg + agents/dqn + agents/double_dqn + agents/dueling_dqn + agents/ac + agents/a2c + + +.. toctree:: + :maxdepth: 1 + :caption: Modules -We have a seperate repository available for benchmarking our algorithm implementations -[here](https://github.com/reinforceio/tensorforce-benchmark). + modules/distributions + modules/layers + modules/memories + modules/networks + modules/objectives + modules/optimizers + modules/parameters + modules/policies + modules/preprocessing + + +.. toctree:: + :maxdepth: 0 + :caption: Execution + + execution/runner + + +.. toctree:: + :maxdepth: 0 + :caption: Environments + + environments/environment + environments/openai_gym + environments/ale + environments/openai_retro + environments/open_sim + environments/ple + environments/vizdoom diff --git a/docs/m2r.py b/docs/m2r.py deleted file mode 100644 index 1ac8f6082..000000000 --- a/docs/m2r.py +++ /dev/null @@ -1,583 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# https://github.com/miyakogi/m2r -# Licensed under the MIT License - -from __future__ import print_function, unicode_literals -import os -import re -from argparse import ArgumentParser, Namespace, ArgumentError - -from docutils import statemachine, nodes, io, utils -from docutils.parsers import rst -from docutils.core import ErrorString -from docutils.utils import SafeString -import mistune - - -_is_sphinx = False -prolog = '''\ -.. role:: raw-html-m2r(raw) - :format: html - -''' - -# for command-line use -parser = ArgumentParser() -options = Namespace() -parser.add_argument('input_file', nargs='*', - help='files to convert to reST format') -parser.add_argument('--overwrite', action='store_true', default=False, - help='overwrite output file without confirmaion') -parser.add_argument('--dry-run', action='store_true', default=False, - help='print conversion result and not save output file') -parser.add_argument('--no-underscore-emphasis', action='store_true', - default=False, - help='do not use underscore (_) for emphasis') - - -def parse_options(): - parser.parse_known_args(namespace=options) - - -class RestBlockGrammar(mistune.BlockGrammar): - directive = re.compile( - r'^( *\.\..*?)\n(?=\S)', - re.DOTALL | re.MULTILINE, - ) - oneline_directive = re.compile( - r'^( *\.\..*?)$', - re.DOTALL | re.MULTILINE, - ) - rest_code_block = re.compile( - r'^::\s*$', - re.DOTALL | re.MULTILINE, - ) - - -class RestBlockLexer(mistune.BlockLexer): - grammar_class = RestBlockGrammar - default_rules = [ - 'directive', - 'oneline_directive', - 'rest_code_block', - ] + mistune.BlockLexer.default_rules - - def parse_directive(self, m): - self.tokens.append({ - 'type': 'directive', - 'text': m.group(1), - }) - - def parse_oneline_directive(self, m): - # reuse directive output - self.tokens.append({ - 'type': 'directive', - 'text': m.group(1), - }) - - def parse_rest_code_block(self, m): - self.tokens.append({ - 'type': 'rest_code_block', - }) - - -class RestInlineGrammar(mistune.InlineGrammar): - image_link = re.compile( - r'\[!\[(?P.*?)\]\((?P.*?)\).*?\]\((?P.*?)\)' - ) - rest_role = re.compile(r':.*?:`.*?`|`[^`]+`:.*?:') - rest_link = re.compile(r'`[^`]*?`_') - inline_math = re.compile(r'`\$(.*)?\$`') - eol_literal_marker = re.compile(r'(\s+)?::\s*$') - # add colon and space as special text - text = re.compile(r'^[\s\S]+?(?=[\\[\s\S]+?)\1{2}(?!\1)' - ) - # _word_ or *word* - emphasis = re.compile( 
- r'^\b_((?:__|[^_])+?)_\b' # _word_ - r'|' - r'^\*(?P(?:\*\*|[^\*])+?)\*(?!\*)' # *word* - ) - - def no_underscore_emphasis(self): - self.double_emphasis = re.compile( - r'^\*{2}(?P[\s\S]+?)\*{2}(?!\*)' # **word** - ) - self.emphasis = re.compile( - r'^\*(?P(?:\*\*|[^\*])+?)\*(?!\*)' # *word* - ) - - -class RestInlineLexer(mistune.InlineLexer): - grammar_class = RestInlineGrammar - default_rules = [ - 'image_link', - 'rest_role', - 'rest_link', - 'inline_math', - 'eol_literal_marker', - ] + mistune.InlineLexer.default_rules - - def __init__(self, *args, **kwargs): - no_underscore_emphasis = kwargs.pop('no_underscore_emphasis', False) - super(RestInlineLexer, self).__init__(*args, **kwargs) - if no_underscore_emphasis: - self.rules.no_underscore_emphasis() - elif not _is_sphinx: - parse_options() - if options.no_underscore_emphasis: - self.rules.no_underscore_emphasis() - - def output_double_emphasis(self, m): - # may include code span - text = self.output(m.group('text')) - return self.renderer.double_emphasis(text) - - def output_emphasis(self, m): - # may include code span - text = self.output(m.group('text') or m.group(1)) - return self.renderer.emphasis(text) - - def output_image_link(self, m): - """Pass through rest role.""" - return self.renderer.image_link( - m.group('url'), m.group('target'), m.group('alt')) - - def output_rest_role(self, m): - """Pass through rest role.""" - return self.renderer.rest_role(m.group(0)) - - def output_rest_link(self, m): - """Pass through rest link.""" - return self.renderer.rest_link(m.group(0)) - - def output_inline_math(self, m): - """Pass through rest link.""" - return self.renderer.inline_math(m.group(1)) - - def output_eol_literal_marker(self, m): - """Pass through rest link.""" - marker = ':' if m.group(1) is None else '' - return self.renderer.eol_literal_marker(marker) - - -class RestRenderer(mistune.Renderer): - _include_raw_html = False - list_indent_re = re.compile(r'^(\s*(#\.|\*)\s)') - indent = ' ' * 3 - list_marker = '{#__rest_list_mark__#}' - hmarks = { - 1: '=', - 2: '-', - 3: '^', - 4: '~', - 5: '"', - 6: '#', - } - - def _indent_block(self, block): - return '\n'.join(self.indent + line if line else '' - for line in block.splitlines()) - - def _raw_html(self, html): - self._include_raw_html = True - return '\ :raw-html-m2r:`{}`\ '.format(html) - - def block_code(self, code, lang=None): - if lang == 'math': - first_line = '\n.. math::\n\n' - elif lang: - first_line = '\n.. code-block:: {}\n\n'.format(lang) - elif _is_sphinx: - first_line = '\n.. code-block:: guess\n\n' - else: - first_line = '\n.. code-block::\n\n' - return first_line + self._indent_block(code) + '\n' - - def block_quote(self, text): - # text includes some empty line - return '\n..\n\n{}\n\n'.format(self._indent_block(text.strip('\n'))) - - def block_html(self, html): - """Rendering block level pure html content. - - :param html: text content of the html snippet. - """ - return '\n\n.. raw:: html\n\n' + self._indent_block(html) + '\n\n' - - def header(self, text, level, raw=None): - """Rendering header/heading tags like ``

`` ``

``. - - :param text: rendered text content for the header. - :param level: a number for the header level, for example: 1. - :param raw: raw text content of the header. - """ - return '\n{0}\n{1}\n'.format(text, self.hmarks[level] * len(text)) - - def hrule(self): - """Rendering method for ``
`` tag.""" - return '\n----\n' - - def list(self, body, ordered=True): - """Rendering list tags like ``
    `` and ``
      ``. - - :param body: body contents of the list. - :param ordered: whether this list is ordered or not. - """ - mark = '#. ' if ordered else '* ' - lines = body.splitlines() - for i, line in enumerate(lines): - if line and not line.startswith(self.list_marker): - lines[i] = ' ' * len(mark) + line - return '\n{}\n'.format( - '\n'.join(lines)).replace(self.list_marker, mark) - - def list_item(self, text): - """Rendering list item snippet. Like ``
    1. ``.""" - return '\n' + self.list_marker + text - - def paragraph(self, text): - """Rendering paragraph tags. Like ``

      ``.""" - return '\n' + text + '\n' - - def table(self, header, body): - """Rendering table element. Wrap header and body in it. - - :param header: header part of the table. - :param body: body part of the table. - """ - table = '\n.. list-table::\n' - if header and not header.isspace(): - table = (table + self.indent + ':header-rows: 1\n\n' + - self._indent_block(header) + '\n') - else: - table = table + '\n' - table = table + self._indent_block(body) + '\n\n' - return table - - def table_row(self, content): - """Rendering a table row. Like ````. - - :param content: content of current table row. - """ - contents = content.splitlines() - if not contents: - return '' - clist = ['* ' + contents[0]] - if len(contents) > 1: - for c in contents[1:]: - clist.append(' ' + c) - return '\n'.join(clist) + '\n' - - def table_cell(self, content, **flags): - """Rendering a table cell. Like ```` ````. - - :param content: content of current table cell. - :param header: whether this is header or not. - :param align: align of current table cell. - """ - return '- ' + content + '\n' - - def double_emphasis(self, text): - """Rendering **strong** text. - - :param text: text content for emphasis. - """ - return '\ **{}**\ '.format(text) - - def emphasis(self, text): - """Rendering *emphasis* text. - - :param text: text content for emphasis. - """ - return '\ *{}*\ '.format(text) - - def codespan(self, text): - """Rendering inline `code` text. - - :param text: text content for inline code. - """ - if '``' not in text: - return '\ ``{}``\ '.format(text) - else: - # actually, docutils split spaces in literal - return self._raw_html( - '' - '{}' - ''.format(text.replace('`', '`'))) - - def linebreak(self): - """Rendering line break like ``
      ``.""" - if self.options.get('use_xhtml'): - return self._raw_html('
      ') + '\n' - return self._raw_html('
      ') + '\n' - - def strikethrough(self, text): - """Rendering ~~strikethrough~~ text. - - :param text: text content for strikethrough. - """ - return self._raw_html('{}'.format(text)) - - def text(self, text): - """Rendering unformatted text. - - :param text: text content. - """ - return text - - def autolink(self, link, is_email=False): - """Rendering a given link or email address. - - :param link: link content or email address. - :param is_email: whether this is an email or not. - """ - return link - - def link(self, link, title, text): - """Rendering a given link with content and title. - - :param link: href link for ```` tag. - :param title: title content for `title` attribute. - :param text: text content for description. - """ - if title: - raise NotImplementedError('sorry') - return '\ `{text} <{target}>`_\ '.format(target=link, text=text) - - def image(self, src, title, text): - """Rendering a image with title and text. - - :param src: source link of the image. - :param title: title text of the image. - :param text: alt text of the image. - """ - # rst does not support title option - # and I couldn't find title attribute in HTML standard - return '\n'.join([ - '', - '.. image:: {}'.format(src), - ' :target: {}'.format(src), - ' :alt: {}'.format(text), - '', - ]) - - def inline_html(self, html): - """Rendering span level pure html content. - - :param html: text content of the html snippet. - """ - return self._raw_html(html) - - def newline(self): - """Rendering newline element.""" - return '' - - def footnote_ref(self, key, index): - """Rendering the ref anchor of a footnote. - - :param key: identity key for the footnote. - :param index: the index count of current footnote. - """ - return '\ [#fn-{}]_\ '.format(key) - - def footnote_item(self, key, text): - """Rendering a footnote item. - - :param key: identity key for the footnote. - :param text: text content of the footnote. - """ - return '.. [#fn-{0}] {1}\n'.format(key, text.strip()) - - def footnotes(self, text): - """Wrapper for all footnotes. - - :param text: contents of all footnotes. - """ - if text: - return '\n\n' + text - else: - return '' - - """Below outputs are for rst.""" - def image_link(self, url, target, alt): - return '\n'.join([ - '', - '.. 
image:: {}'.format(url), - ' :target: {}'.format(target), - ' :alt: {}'.format(alt), - '', - ]) - - def rest_role(self, text): - return text - - def rest_link(self, text): - return text - - def inline_math(self, math): - """Extension of recommonmark""" - return '\ :math:`{}`\ '.format(math) - - def eol_literal_marker(self, marker): - """Extension of recommonmark""" - return marker - - def directive(self, text): - return '\n' + text + '\n' - - def rest_code_block(self): - return '\n\n' - - -class M2R(mistune.Markdown): - def __init__(self, renderer=None, inline=RestInlineLexer, - block=RestBlockLexer, **kwargs): - if renderer is None: - renderer = RestRenderer(**kwargs) - super(M2R, self).__init__(renderer, inline=inline, block=block, - **kwargs) - - def parse(self, text): - output = super(M2R, self).parse(text) - return self.post_process(output) - - def output_directive(self): - return self.renderer.directive(self.token['text']) - - def output_rest_code_block(self): - return self.renderer.rest_code_block() - - def post_process(self, text): - output = (text - .replace('\\ \n', '\n') - .replace('\n\\ ', '\n') - .replace(' \\ ', ' ') - .replace('\\ ', ' ') - .replace('\\ .', '.') - ) - if self.renderer._include_raw_html: - return prolog + output - else: - return output - - -class M2RParser(rst.Parser, object): - def parse(self, inputstring, document): - config = document.settings.env.config - converter = M2R(no_underscore_emphasis=config.no_underscore_emphasis) - super(M2RParser, self).parse(converter(inputstring), document) - - -class MdInclude(rst.Directive): - """Directive class to include markdown in sphinx. - - Load a file and convert it to rst and insert as a node. Currentlly - directive-specific options are not implemented. - """ - required_arguments = 1 - optional_arguments = 0 - - def run(self): - """Most of this method is from ``docutils.parser.rst.Directive``. - - docutils version: 0.12 - """ - if not self.state.document.settings.file_insertion_enabled: - raise self.warning('"%s" directive disabled.' % self.name) - source = self.state_machine.input_lines.source( - self.lineno - self.state_machine.input_offset - 1) - source_dir = os.path.dirname(os.path.abspath(source)) - path = rst.directives.path(self.arguments[0]) - path = os.path.normpath(os.path.join(source_dir, path)) - path = utils.relative_path(None, path) - path = nodes.reprunicode(path) - - # get options (currently not use directive-specific options) - encoding = self.options.get( - 'encoding', self.state.document.settings.input_encoding) - e_handler = self.state.document.settings.input_encoding_error_handler - tab_width = self.options.get( - 'tab-width', self.state.document.settings.tab_width) - - # open the inclding file - try: - self.state.document.settings.record_dependencies.add(path) - include_file = io.FileInput(source_path=path, - encoding=encoding, - error_handler=e_handler) - except UnicodeEncodeError as error: - raise self.severe('Problems with "%s" directive path:\n' - 'Cannot encode input file path "%s" ' - '(wrong locale?).' % - (self.name, SafeString(path))) - except IOError as error: - raise self.severe('Problems with "%s" directive path:\n%s.' 
% - (self.name, ErrorString(error))) - - # read from the file - try: - rawtext = include_file.read() - except UnicodeError as error: - raise self.severe('Problem with "%s" directive:\n%s' % - (self.name, ErrorString(error))) - - config = self.state.document.settings.env.config - converter = M2R(no_underscore_emphasis=config.no_underscore_emphasis) - include_lines = statemachine.string2lines(converter(rawtext), - tab_width, - convert_whitespace=True) - self.state_machine.insert_input(include_lines, path) - return [] - - -def setup(app): - """When used for spinx extension.""" - global _is_sphinx - _is_sphinx = True - app.add_config_value('no_underscore_emphasis', False, 'env') - app.add_source_parser('.md', M2RParser) - app.add_directive('mdinclude', MdInclude) - - -def parse_from_file(file): - if not os.path.exists(file): - raise OSError('No such file exists: {}'.format(file)) - with open(file) as f: - src = f.read() - output = M2R()(src) - return output - - -def save_to_file(file, src): - target = os.path.splitext(file)[0] + '.rst' - if not options.overwrite and os.path.exists(target): - confirm = input('{} already exists. overwrite it? [y/n]: '.format( - target)) - if confirm.upper() not in ('Y', 'YES'): - print('skip {}'.format(file)) - return - with open(target, 'w') as f: - f.write(src) - - -def main(): - # parse cli options - parse_options() - if not options.input_file: - raise ArgumentError('input files are required') - for file in options.input_file: - output = parse_from_file(file) - if options.dry_run: - print(output) - else: - save_to_file(file, output) - - -if __name__ == '__main__': - main() diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..57aa4a696 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=TF + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/mistune.py b/docs/mistune.py deleted file mode 100644 index 9def15d11..000000000 --- a/docs/mistune.py +++ /dev/null @@ -1,1170 +0,0 @@ -# coding: utf-8 - -# https://github.com/lepture/mistune -# Copyright (c) 2014 - 2015, Hsiaoming Yang -# All rights reserved. -# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -# * Neither the name of the creator nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
- -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - - -""" - mistune - ~~~~~~~ - - The fastest markdown parser in pure Python with renderer feature. - - :copyright: (c) 2014 - 2017 by Hsiaoming Yang. -""" - -import re -import inspect - -__version__ = '0.7.4' -__author__ = 'Hsiaoming Yang ' -__all__ = [ - 'BlockGrammar', 'BlockLexer', - 'InlineGrammar', 'InlineLexer', - 'Renderer', 'Markdown', - 'markdown', 'escape', -] - - -_key_pattern = re.compile(r'\s+') -_nonalpha_pattern = re.compile(r'\W') -_escape_pattern = re.compile(r'&(?!#?\w+;)') -_newline_pattern = re.compile(r'\r\n|\r') -_block_quote_leading_pattern = re.compile(r'^ *> ?', flags=re.M) -_block_code_leading_pattern = re.compile(r'^ {4}', re.M) -_inline_tags = [ - 'a', 'em', 'strong', 'small', 's', 'cite', 'q', 'dfn', 'abbr', 'data', - 'time', 'code', 'var', 'samp', 'kbd', 'sub', 'sup', 'i', 'b', 'u', 'mark', - 'ruby', 'rt', 'rp', 'bdi', 'bdo', 'span', 'br', 'wbr', 'ins', 'del', - 'img', 'font', -] -_pre_tags = ['pre', 'script', 'style'] -_valid_end = r'(?!:/|[^\w\s@]*@)\b' -_valid_attr = r'''\s*[a-zA-Z\-](?:\=(?:"[^"]*"|'[^']*'|[^\s'">]+))?''' -_block_tag = r'(?!(?:%s)\b)\w+%s' % ('|'.join(_inline_tags), _valid_end) -_scheme_blacklist = ('javascript:', 'vbscript:') - - -def _pure_pattern(regex): - pattern = regex.pattern - if pattern.startswith('^'): - pattern = pattern[1:] - return pattern - - -def _keyify(key): - return _key_pattern.sub(' ', key.lower()) - - -def escape(text, quote=False, smart_amp=True): - """Replace special characters "&", "<" and ">" to HTML-safe sequences. - - The original cgi.escape will always escape "&", but you can control - this one for a smart escape amp. - - :param quote: if set to True, " and ' will be escaped. - :param smart_amp: if set to False, & will always be escaped. - """ - if smart_amp: - text = _escape_pattern.sub('&', text) - else: - text = text.replace('&', '&') - text = text.replace('<', '<') - text = text.replace('>', '>') - if quote: - text = text.replace('"', '"') - text = text.replace("'", ''') - return text - - -def escape_link(url): - """Remove dangerous URL schemes like javascript: and escape afterwards.""" - lower_url = url.lower().strip('\x00\x1a \n\r\t') - for scheme in _scheme_blacklist: - if lower_url.startswith(scheme): - return '' - return escape(url, quote=True, smart_amp=False) - - -def preprocessing(text, tab=4): - text = _newline_pattern.sub('\n', text) - text = text.expandtabs(tab) - text = text.replace('\u2424', '\n') - pattern = re.compile(r'^ +$', re.M) - return pattern.sub('', text) - - -class BlockGrammar(object): - """Grammars for block level tokens.""" - - def_links = re.compile( - r'^ *\[([^^\]]+)\]: *' # [key]: - r']+)>?' # or link - r'(?: +["(]([^\n]+)[")])? 
*(?:\n+|$)' - ) - def_footnotes = re.compile( - r'^\[\^([^\]]+)\]: *(' - r'[^\n]*(?:\n+|$)' # [^key]: - r'(?: {1,}[^\n]*(?:\n+|$))*' - r')' - ) - - newline = re.compile(r'^\n+') - block_code = re.compile(r'^( {4}[^\n]+\n*)+') - fences = re.compile( - r'^ *(`{3,}|~{3,}) *(\S+)? *\n' # ```lang - r'([\s\S]+?)\s*' - r'\1 *(?:\n+|$)' # ``` - ) - hrule = re.compile(r'^ {0,3}[-*_](?: *[-*_]){2,} *(?:\n+|$)') - heading = re.compile(r'^ *(#{1,6}) *([^\n]+?) *#* *(?:\n+|$)') - lheading = re.compile(r'^([^\n]+)\n *(=|-)+ *(?:\n+|$)') - block_quote = re.compile(r'^( *>[^\n]+(\n[^\n]+)*\n*)+') - list_block = re.compile( - r'^( *)([*+-]|\d+\.) [\s\S]+?' - r'(?:' - r'\n+(?=\1?(?:[-*_] *){3,}(?:\n+|$))' # hrule - r'|\n+(?=%s)' # def links - r'|\n+(?=%s)' # def footnotes - r'|\n{2,}' - r'(?! )' - r'(?!\1(?:[*+-]|\d+\.) )\n*' - r'|' - r'\s*$)' % ( - _pure_pattern(def_links), - _pure_pattern(def_footnotes), - ) - ) - list_item = re.compile( - r'^(( *)(?:[*+-]|\d+\.) [^\n]*' - r'(?:\n(?!\2(?:[*+-]|\d+\.) )[^\n]*)*)', - flags=re.M - ) - list_bullet = re.compile(r'^ *(?:[*+-]|\d+\.) +') - paragraph = re.compile( - r'^((?:[^\n]+\n?(?!' - r'%s|%s|%s|%s|%s|%s|%s|%s|%s' - r'))+)\n*' % ( - _pure_pattern(fences).replace(r'\1', r'\2'), - _pure_pattern(list_block).replace(r'\1', r'\3'), - _pure_pattern(hrule), - _pure_pattern(heading), - _pure_pattern(lheading), - _pure_pattern(block_quote), - _pure_pattern(def_links), - _pure_pattern(def_footnotes), - '<' + _block_tag, - ) - ) - block_html = re.compile( - r'^ *(?:%s|%s|%s) *(?:\n{2,}|\s*$)' % ( - r'', - r'<(%s)((?:%s)*?)>([\s\S]*?)<\/\1>' % (_block_tag, _valid_attr), - r'<%s(?:%s)*?\s*\/?>' % (_block_tag, _valid_attr), - ) - ) - table = re.compile( - r'^ *\|(.+)\n *\|( *[-:]+[-| :]*)\n((?: *\|.*(?:\n|$))*)\n*' - ) - nptable = re.compile( - r'^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*' - ) - text = re.compile(r'^[^\n]+') - - -class BlockLexer(object): - """Block level lexer for block grammars.""" - grammar_class = BlockGrammar - - default_rules = [ - 'newline', 'hrule', 'block_code', 'fences', 'heading', - 'nptable', 'lheading', 'block_quote', - 'list_block', 'block_html', 'def_links', - 'def_footnotes', 'table', 'paragraph', 'text' - ] - - list_rules = ( - 'newline', 'block_code', 'fences', 'lheading', 'hrule', - 'block_quote', 'list_block', 'block_html', 'text', - ) - - footnote_rules = ( - 'newline', 'block_code', 'fences', 'heading', - 'nptable', 'lheading', 'hrule', 'block_quote', - 'list_block', 'block_html', 'table', 'paragraph', 'text' - ) - - def __init__(self, rules=None, **kwargs): - self.tokens = [] - self.def_links = {} - self.def_footnotes = {} - - if not rules: - rules = self.grammar_class() - - self.rules = rules - - def __call__(self, text, rules=None): - return self.parse(text, rules) - - def parse(self, text, rules=None): - text = text.rstrip('\n') - - if not rules: - rules = self.default_rules - - def manipulate(text): - for key in rules: - rule = getattr(self.rules, key) - m = rule.match(text) - if not m: - continue - getattr(self, 'parse_%s' % key)(m) - return m - return False # pragma: no cover - - while text: - m = manipulate(text) - if m is not False: - text = text[len(m.group(0)):] - continue - if text: # pragma: no cover - raise RuntimeError('Infinite loop at: %s' % text) - return self.tokens - - def parse_newline(self, m): - length = len(m.group(0)) - if length > 1: - self.tokens.append({'type': 'newline'}) - - def parse_block_code(self, m): - # clean leading whitespace - code = _block_code_leading_pattern.sub('', 
m.group(0)) - self.tokens.append({ - 'type': 'code', - 'lang': None, - 'text': code, - }) - - def parse_fences(self, m): - self.tokens.append({ - 'type': 'code', - 'lang': m.group(2), - 'text': m.group(3), - }) - - def parse_heading(self, m): - self.tokens.append({ - 'type': 'heading', - 'level': len(m.group(1)), - 'text': m.group(2), - }) - - def parse_lheading(self, m): - """Parse setext heading.""" - self.tokens.append({ - 'type': 'heading', - 'level': 1 if m.group(2) == '=' else 2, - 'text': m.group(1), - }) - - def parse_hrule(self, m): - self.tokens.append({'type': 'hrule'}) - - def parse_list_block(self, m): - bull = m.group(2) - self.tokens.append({ - 'type': 'list_start', - 'ordered': '.' in bull, - }) - cap = m.group(0) - self._process_list_item(cap, bull) - self.tokens.append({'type': 'list_end'}) - - def _process_list_item(self, cap, bull): - cap = self.rules.list_item.findall(cap) - - _next = False - length = len(cap) - - for i in range(length): - item = cap[i][0] - - # remove the bullet - space = len(item) - item = self.rules.list_bullet.sub('', item) - - # outdent - if '\n ' in item: - space = space - len(item) - pattern = re.compile(r'^ {1,%d}' % space, flags=re.M) - item = pattern.sub('', item) - - # determine whether item is loose or not - loose = _next - if not loose and re.search(r'\n\n(?!\s*$)', item): - loose = True - - rest = len(item) - if i != length - 1 and rest: - _next = item[rest-1] == '\n' - if not loose: - loose = _next - - if loose: - t = 'loose_item_start' - else: - t = 'list_item_start' - - self.tokens.append({'type': t}) - # recurse - self.parse(item, self.list_rules) - self.tokens.append({'type': 'list_item_end'}) - - def parse_block_quote(self, m): - self.tokens.append({'type': 'block_quote_start'}) - # clean leading > - cap = _block_quote_leading_pattern.sub('', m.group(0)) - self.parse(cap) - self.tokens.append({'type': 'block_quote_end'}) - - def parse_def_links(self, m): - key = _keyify(m.group(1)) - self.def_links[key] = { - 'link': m.group(2), - 'title': m.group(3), - } - - def parse_def_footnotes(self, m): - key = _keyify(m.group(1)) - if key in self.def_footnotes: - # footnote is already defined - return - - self.def_footnotes[key] = 0 - - self.tokens.append({ - 'type': 'footnote_start', - 'key': key, - }) - - text = m.group(2) - - if '\n' in text: - lines = text.split('\n') - whitespace = None - for line in lines[1:]: - space = len(line) - len(line.lstrip()) - if space and (not whitespace or space < whitespace): - whitespace = space - newlines = [lines[0]] - for line in lines[1:]: - newlines.append(line[whitespace:]) - text = '\n'.join(newlines) - - self.parse(text, self.footnote_rules) - - self.tokens.append({ - 'type': 'footnote_end', - 'key': key, - }) - - def parse_table(self, m): - item = self._process_table(m) - - cells = re.sub(r'(?: *\| *)?\n$', '', m.group(3)) - cells = cells.split('\n') - for i, v in enumerate(cells): - v = re.sub(r'^ *\| *| *\| *$', '', v) - cells[i] = re.split(r' *\| *', v) - - item['cells'] = cells - self.tokens.append(item) - - def parse_nptable(self, m): - item = self._process_table(m) - - cells = re.sub(r'\n$', '', m.group(3)) - cells = cells.split('\n') - for i, v in enumerate(cells): - cells[i] = re.split(r' *\| *', v) - - item['cells'] = cells - self.tokens.append(item) - - def _process_table(self, m): - header = re.sub(r'^ *| *\| *$', '', m.group(1)) - header = re.split(r' *\| *', header) - align = re.sub(r' *|\| *$', '', m.group(2)) - align = re.split(r' *\| *', align) - - for i, v in enumerate(align): - if 
re.search(r'^ *-+: *$', v): - align[i] = 'right' - elif re.search(r'^ *:-+: *$', v): - align[i] = 'center' - elif re.search(r'^ *:-+ *$', v): - align[i] = 'left' - else: - align[i] = None - - item = { - 'type': 'table', - 'header': header, - 'align': align, - } - return item - - def parse_block_html(self, m): - tag = m.group(1) - if not tag: - text = m.group(0) - self.tokens.append({ - 'type': 'close_html', - 'text': text - }) - else: - attr = m.group(2) - text = m.group(3) - self.tokens.append({ - 'type': 'open_html', - 'tag': tag, - 'extra': attr, - 'text': text - }) - - def parse_paragraph(self, m): - text = m.group(1).rstrip('\n') - self.tokens.append({'type': 'paragraph', 'text': text}) - - def parse_text(self, m): - text = m.group(0) - self.tokens.append({'type': 'text', 'text': text}) - - -class InlineGrammar(object): - """Grammars for inline level tokens.""" - - escape = re.compile(r'^\\([\\`*{}\[\]()#+\-.!_>~|])') # \* \+ \! .... - inline_html = re.compile( - r'^(?:%s|%s|%s)' % ( - r'', - r'<(\w+%s)((?:%s)*?)\s*>([\s\S]*?)<\/\1>' % (_valid_end, _valid_attr), - r'<\w+%s(?:%s)*?\s*\/?>' % (_valid_end, _valid_attr), - ) - ) - autolink = re.compile(r'^<([^ >]+(@|:)[^ >]+)>') - link = re.compile( - r'^!?\[(' - r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*' - r')\]\(' - r'''\s*(<)?([\s\S]*?)(?(2)>)(?:\s+['"]([\s\S]*?)['"])?\s*''' - r'\)' - ) - reflink = re.compile( - r'^!?\[(' - r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*' - r')\]\s*\[([^^\]]*)\]' - ) - nolink = re.compile(r'^!?\[((?:\[[^\]]*\]|[^\[\]])*)\]') - url = re.compile(r'''^(https?:\/\/[^\s<]+[^<.,:;"')\]\s])''') - double_emphasis = re.compile( - r'^_{2}([\s\S]+?)_{2}(?!_)' # __word__ - r'|' - r'^\*{2}([\s\S]+?)\*{2}(?!\*)' # **word** - ) - emphasis = re.compile( - r'^\b_((?:__|[^_])+?)_\b' # _word_ - r'|' - r'^\*((?:\*\*|[^\*])+?)\*(?!\*)' # *word* - ) - code = re.compile(r'^(`+)\s*([\s\S]*?[^`])\s*\1(?!`)') # `code` - linebreak = re.compile(r'^ {2,}\n(?!\s*$)') - strikethrough = re.compile(r'^~~(?=\S)([\s\S]*?\S)~~') # ~~word~~ - footnote = re.compile(r'^\[\^([^\]]+)\]') - text = re.compile(r'^[\s\S]+?(?=[\\%s' % (tag, extra, text, tag) - else: - html = m.group(0) - return self.renderer.inline_html(html) - - def output_footnote(self, m): - key = _keyify(m.group(1)) - if key not in self.footnotes: - return None - if self.footnotes[key]: - return None - self.footnote_index += 1 - self.footnotes[key] = self.footnote_index - return self.renderer.footnote_ref(key, self.footnote_index) - - def output_link(self, m): - return self._process_link(m, m.group(3), m.group(4)) - - def output_reflink(self, m): - key = _keyify(m.group(2) or m.group(1)) - if key not in self.links: - return None - ret = self.links[key] - return self._process_link(m, ret['link'], ret['title']) - - def output_nolink(self, m): - key = _keyify(m.group(1)) - if key not in self.links: - return None - ret = self.links[key] - return self._process_link(m, ret['link'], ret['title']) - - def _process_link(self, m, link, title=None): - line = m.group(0) - text = m.group(1) - if line[0] == '!': - return self.renderer.image(link, title, text) - - self._in_link = True - text = self.output(text) - self._in_link = False - return self.renderer.link(link, title, text) - - def output_double_emphasis(self, m): - text = m.group(2) or m.group(1) - text = self.output(text) - return self.renderer.double_emphasis(text) - - def output_emphasis(self, m): - text = m.group(2) or m.group(1) - text = self.output(text) - return self.renderer.emphasis(text) - - def output_code(self, m): - text = 
m.group(2) - return self.renderer.codespan(text) - - def output_linebreak(self, m): - return self.renderer.linebreak() - - def output_strikethrough(self, m): - text = self.output(m.group(1)) - return self.renderer.strikethrough(text) - - def output_text(self, m): - text = m.group(0) - return self.renderer.text(text) - - -class Renderer(object): - """The default HTML renderer for rendering Markdown. - """ - - def __init__(self, **kwargs): - self.options = kwargs - - def placeholder(self): - """Returns the default, empty output value for the renderer. - - All renderer methods use the '+=' operator to append to this value. - Default is a string so rendering HTML can build up a result string with - the rendered Markdown. - - Can be overridden by Renderer subclasses to be types like an empty - list, allowing the renderer to create a tree-like structure to - represent the document (which can then be reprocessed later into a - separate format like docx or pdf). - """ - return '' - - def block_code(self, code, lang=None): - """Rendering block level code. ``pre > code``. - - :param code: text content of the code block. - :param lang: language of the given code. - """ - code = code.rstrip('\n') - if not lang: - code = escape(code, smart_amp=False) - return '

      %s\n
      \n' % code - code = escape(code, quote=True, smart_amp=False) - return '
      %s\n
      \n' % (lang, code) - - def block_quote(self, text): - """Rendering
      with the given text. - - :param text: text content of the blockquote. - """ - return '
      %s\n
      \n' % text.rstrip('\n') - - def block_html(self, html): - """Rendering block level pure html content. - - :param html: text content of the html snippet. - """ - if self.options.get('skip_style') and \ - html.lower().startswith('`` ``

      ``. - - :param text: rendered text content for the header. - :param level: a number for the header level, for example: 1. - :param raw: raw text content of the header. - """ - return '%s\n' % (level, text, level) - - def hrule(self): - """Rendering method for ``
      `` tag.""" - if self.options.get('use_xhtml'): - return '
      \n' - return '
      \n' - - def list(self, body, ordered=True): - """Rendering list tags like ``
        `` and ``
          ``. - - :param body: body contents of the list. - :param ordered: whether this list is ordered or not. - """ - tag = 'ul' - if ordered: - tag = 'ol' - return '<%s>\n%s\n' % (tag, body, tag) - - def list_item(self, text): - """Rendering list item snippet. Like ``
        1. ``.""" - return '
        2. %s
        3. \n' % text - - def paragraph(self, text): - """Rendering paragraph tags. Like ``

          ``.""" - return '

          %s

          \n' % text.strip(' ') - - def table(self, header, body): - """Rendering table element. Wrap header and body in it. - - :param header: header part of the table. - :param body: body part of the table. - """ - return ( - '\n%s\n' - '\n%s\n
          \n' - ) % (header, body) - - def table_row(self, content): - """Rendering a table row. Like ````. - - :param content: content of current table row. - """ - return '\n%s\n' % content - - def table_cell(self, content, **flags): - """Rendering a table cell. Like ```` ````. - - :param content: content of current table cell. - :param header: whether this is header or not. - :param align: align of current table cell. - """ - if flags['header']: - tag = 'th' - else: - tag = 'td' - align = flags['align'] - if not align: - return '<%s>%s\n' % (tag, content, tag) - return '<%s style="text-align:%s">%s\n' % ( - tag, align, content, tag - ) - - def double_emphasis(self, text): - """Rendering **strong** text. - - :param text: text content for emphasis. - """ - return '%s' % text - - def emphasis(self, text): - """Rendering *emphasis* text. - - :param text: text content for emphasis. - """ - return '%s' % text - - def codespan(self, text): - """Rendering inline `code` text. - - :param text: text content for inline code. - """ - text = escape(text.rstrip(), smart_amp=False) - return '%s' % text - - def linebreak(self): - """Rendering line break like ``
          ``.""" - if self.options.get('use_xhtml'): - return '
          \n' - return '
          \n' - - def strikethrough(self, text): - """Rendering ~~strikethrough~~ text. - - :param text: text content for strikethrough. - """ - return '%s' % text - - def text(self, text): - """Rendering unformatted text. - - :param text: text content. - """ - if self.options.get('parse_block_html'): - return text - return escape(text) - - def escape(self, text): - """Rendering escape sequence. - - :param text: text content. - """ - return escape(text) - - def autolink(self, link, is_email=False): - """Rendering a given link or email address. - - :param link: link content or email address. - :param is_email: whether this is an email or not. - """ - text = link = escape(link) - if is_email: - link = 'mailto:%s' % link - return '
          %s' % (link, text) - - def link(self, link, title, text): - """Rendering a given link with content and title. - - :param link: href link for ```` tag. - :param title: title content for `title` attribute. - :param text: text content for description. - """ - link = escape_link(link) - if not title: - return '%s' % (link, text) - title = escape(title, quote=True) - return '%s' % (link, title, text) - - def image(self, src, title, text): - """Rendering a image with title and text. - - :param src: source link of the image. - :param title: title text of the image. - :param text: alt text of the image. - """ - src = escape_link(src) - text = escape(text, quote=True) - if title: - title = escape(title, quote=True) - html = '%s' % html - return '%s>' % html - - def inline_html(self, html): - """Rendering span level pure html content. - - :param html: text content of the html snippet. - """ - if self.options.get('escape'): - return escape(html) - return html - - def newline(self): - """Rendering newline element.""" - return '' - - def footnote_ref(self, key, index): - """Rendering the ref anchor of a footnote. - - :param key: identity key for the footnote. - :param index: the index count of current footnote. - """ - html = ( - '' - '%d' - ) % (escape(key), escape(key), index) - return html - - def footnote_item(self, key, text): - """Rendering a footnote item. - - :param key: identity key for the footnote. - :param text: text content of the footnote. - """ - back = ( - '' - ) % escape(key) - text = text.rstrip() - if text.endswith('

          '): - text = re.sub(r'<\/p>$', r'%s

          ' % back, text) - else: - text = '%s

          %s

          ' % (text, back) - html = '
        4. %s
        5. \n' % (escape(key), text) - return html - - def footnotes(self, text): - """Wrapper for all footnotes. - - :param text: contents of all footnotes. - """ - html = '
          \n%s
            %s
          \n
          \n' - return html % (self.hrule(), text) - - -class Markdown(object): - """The Markdown parser. - - :param renderer: An instance of ``Renderer``. - :param inline: An inline lexer class or instance. - :param block: A block lexer class or instance. - """ - def __init__(self, renderer=None, inline=None, block=None, **kwargs): - if not renderer: - renderer = Renderer(**kwargs) - else: - kwargs.update(renderer.options) - - self.renderer = renderer - - if inline and inspect.isclass(inline): - inline = inline(renderer, **kwargs) - if block and inspect.isclass(block): - block = block(**kwargs) - - if inline: - self.inline = inline - else: - self.inline = InlineLexer(renderer, **kwargs) - - self.block = block or BlockLexer(BlockGrammar()) - self.footnotes = [] - self.tokens = [] - - # detect if it should parse text in block html - self._parse_block_html = kwargs.get('parse_block_html') - - def __call__(self, text): - return self.parse(text) - - def render(self, text): - """Render the Markdown text. - - :param text: markdown formatted text content. - """ - return self.parse(text) - - def parse(self, text): - out = self.output(preprocessing(text)) - - keys = self.block.def_footnotes - - # reset block - self.block.def_links = {} - self.block.def_footnotes = {} - - # reset inline - self.inline.links = {} - self.inline.footnotes = {} - - if not self.footnotes: - return out - - footnotes = filter(lambda o: keys.get(o['key']), self.footnotes) - self.footnotes = sorted( - footnotes, key=lambda o: keys.get(o['key']), reverse=True - ) - - body = self.renderer.placeholder() - while self.footnotes: - note = self.footnotes.pop() - body += self.renderer.footnote_item( - note['key'], note['text'] - ) - - out += self.renderer.footnotes(body) - return out - - def pop(self): - if not self.tokens: - return None - self.token = self.tokens.pop() - return self.token - - def peek(self): - if self.tokens: - return self.tokens[-1] - return None # pragma: no cover - - def output(self, text, rules=None): - self.tokens = self.block(text, rules) - self.tokens.reverse() - - self.inline.setup(self.block.def_links, self.block.def_footnotes) - - out = self.renderer.placeholder() - while self.pop(): - out += self.tok() - return out - - def tok(self): - t = self.token['type'] - - # sepcial cases - if t.endswith('_start'): - t = t[:-6] - - return getattr(self, 'output_%s' % t)() - - def tok_text(self): - text = self.token['text'] - while self.peek()['type'] == 'text': - text += '\n' + self.pop()['text'] - return self.inline(text) - - def output_newline(self): - return self.renderer.newline() - - def output_hrule(self): - return self.renderer.hrule() - - def output_heading(self): - return self.renderer.header( - self.inline(self.token['text']), - self.token['level'], - self.token['text'], - ) - - def output_code(self): - return self.renderer.block_code( - self.token['text'], self.token['lang'] - ) - - def output_table(self): - aligns = self.token['align'] - aligns_length = len(aligns) - cell = self.renderer.placeholder() - - # header part - header = self.renderer.placeholder() - for i, value in enumerate(self.token['header']): - align = aligns[i] if i < aligns_length else None - flags = {'header': True, 'align': align} - cell += self.renderer.table_cell(self.inline(value), **flags) - - header += self.renderer.table_row(cell) - - # body part - body = self.renderer.placeholder() - for i, row in enumerate(self.token['cells']): - cell = self.renderer.placeholder() - for j, value in enumerate(row): - align = aligns[j] if j < 
aligns_length else None - flags = {'header': False, 'align': align} - cell += self.renderer.table_cell(self.inline(value), **flags) - body += self.renderer.table_row(cell) - - return self.renderer.table(header, body) - - def output_block_quote(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'block_quote_end': - body += self.tok() - return self.renderer.block_quote(body) - - def output_list(self): - ordered = self.token['ordered'] - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_end': - body += self.tok() - return self.renderer.list(body, ordered) - - def output_list_item(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_item_end': - if self.token['type'] == 'text': - body += self.tok_text() - else: - body += self.tok() - - return self.renderer.list_item(body) - - def output_loose_item(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_item_end': - body += self.tok() - return self.renderer.list_item(body) - - def output_footnote(self): - self.inline._in_footnote = True - body = self.renderer.placeholder() - key = self.token['key'] - while self.pop()['type'] != 'footnote_end': - body += self.tok() - self.footnotes.append({'key': key, 'text': body}) - self.inline._in_footnote = False - return self.renderer.placeholder() - - def output_close_html(self): - text = self.token['text'] - return self.renderer.block_html(text) - - def output_open_html(self): - text = self.token['text'] - tag = self.token['tag'] - if self._parse_block_html and tag not in _pre_tags: - text = self.inline(text, rules=self.inline.inline_html_rules) - extra = self.token.get('extra') or '' - html = '<%s%s>%s' % (tag, extra, text, tag) - return self.renderer.block_html(html) - - def output_paragraph(self): - return self.renderer.paragraph(self.inline(self.token['text'])) - - def output_text(self): - return self.renderer.paragraph(self.tok_text()) - - -def markdown(text, escape=True, **kwargs): - """Render markdown formatted text to html. - - :param text: markdown formatted text content. - :param escape: if set to False, all html tags will not be escaped. - :param use_xhtml: output with xhtml tags. - :param hard_wrap: if set to True, it will use the GFM line breaks feature. - :param parse_block_html: parse text only in block level html. - :param parse_inline_html: parse text only in inline level html. - """ - return Markdown(escape=escape, **kwargs)(text) diff --git a/docs/modules/distributions.rst b/docs/modules/distributions.rst new file mode 100644 index 000000000..b49818950 --- /dev/null +++ b/docs/modules/distributions.rst @@ -0,0 +1,26 @@ +Distributions +============= + +Distributions are customized via the ``distributions`` argument of ``policy``, for instance: + +.. code-block:: python + + Agent.create( + ... + policy=dict(distributions=dict( + float=dict(type='gaussian', stddev_mode='global'), + bounded_action=dict(type='beta') + )) + ... + ) + +See the `policies documentation `_ for more information about how to specify a policy. + + +.. autoclass:: tensorforce.core.distributions.Categorical + +.. autoclass:: tensorforce.core.distributions.Gaussian + +.. autoclass:: tensorforce.core.distributions.Bernoulli + +.. 
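As the example above illustrates, the keys of the ``distributions`` dict can refer either to an action type (``bool``/``int``/``float``) or to the name of an individual action (as ``bounded_action`` presumably is above). A minimal sketch along the same lines, assuming an ``int`` action component for which the categorical distribution is selected explicitly:

.. code-block:: python

    Agent.create(
        ...
        policy=dict(distributions=dict(
            int=dict(type='categorical')
        )),
        ...
    )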
autoclass:: tensorforce.core.distributions.Beta diff --git a/docs/modules/layers.rst b/docs/modules/layers.rst new file mode 100644 index 000000000..d9d819806 --- /dev/null +++ b/docs/modules/layers.rst @@ -0,0 +1,125 @@ +Layers +====== + +See the `networks documentation `_ for more information about how to specify networks. + +Default layer: ``Function`` with default argument ``function``, so a ``lambda`` function is a short-form specification of a simple transformation layer: + +.. code-block:: python + + Agent.create( + ... + policy=dict(network=[ + (lambda x: tf.clip_by_value(x, -1.0, 1.0)), + ... + ]), + ... + ) + + +Dense layers +------------ + +.. autoclass:: tensorforce.core.layers.Dense + +.. autoclass:: tensorforce.core.layers.Linear + + +Convolutional layers +-------------------- + +.. autoclass:: tensorforce.core.layers.Conv1d + +.. autoclass:: tensorforce.core.layers.Conv2d + +.. autoclass:: tensorforce.core.layers.Conv1dTranspose + +.. autoclass:: tensorforce.core.layers.Conv2dTranspose + + +Embedding layers +---------------- + +.. autoclass:: tensorforce.core.layers.Embedding + + +Recurrent layers (unrolled over timesteps) +------------------------------------------ + +.. autoclass:: tensorforce.core.layers.Rnn + +.. autoclass:: tensorforce.core.layers.Lstm + +.. autoclass:: tensorforce.core.layers.Gru + + +Input recurrent layers (unrolled over sequence input) +----------------------------------------------------- + +.. autoclass:: tensorforce.core.layers.InputRnn + +.. autoclass:: tensorforce.core.layers.InputLstm + +.. autoclass:: tensorforce.core.layers.InputGru + + +Pooling layers +-------------- + +.. autoclass:: tensorforce.core.layers.Flatten + +.. autoclass:: tensorforce.core.layers.Pooling + +.. autoclass:: tensorforce.core.layers.Pool1d + +.. autoclass:: tensorforce.core.layers.Pool2d + + +Normalization layers +-------------------- + +.. autoclass:: tensorforce.core.layers.LinearNormalization + +.. autoclass:: tensorforce.core.layers.ExponentialNormalization + +.. autoclass:: tensorforce.core.layers.InstanceNormalization + +.. autoclass:: tensorforce.core.layers.BatchNormalization + + +Misc layers +----------- + +.. autoclass:: tensorforce.core.layers.Reshape + +.. autoclass:: tensorforce.core.layers.Activation + +.. autoclass:: tensorforce.core.layers.Dropout + +.. autoclass:: tensorforce.core.layers.Clipping + +.. autoclass:: tensorforce.core.layers.Image + +.. autoclass:: tensorforce.core.layers.Deltafier + +.. autoclass:: tensorforce.core.layers.Sequence + + +Special layers +-------------- + +.. autoclass:: tensorforce.core.layers.Function + +.. autoclass:: tensorforce.core.layers.Register + +.. autoclass:: tensorforce.core.layers.Retrieve + +.. autoclass:: tensorforce.core.layers.Block + +.. autoclass:: tensorforce.core.layers.Reuse + + +Keras layer +----------- + +.. autoclass:: tensorforce.core.layers.KerasLayer diff --git a/docs/modules/memories.rst b/docs/modules/memories.rst new file mode 100644 index 000000000..6e2333a0a --- /dev/null +++ b/docs/modules/memories.rst @@ -0,0 +1,17 @@ +Memories +======== + +Default memory: ``Replay`` with default argument ``capacity``, so an ``int`` is a short-form specification of a replay memory with corresponding capacity: + +.. code-block:: python + + Agent.create( + ... + memory=10000, + ... + ) + + +.. autoclass:: tensorforce.core.memories.Replay + +.. 
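The ``int`` short-form above is equivalent to the explicit dictionary form, which also allows choosing the ``Recent`` memory instead of the default ``Replay``; a minimal sketch following the ``dict(type=..., ...)`` pattern used throughout these examples:

.. code-block:: python

    Agent.create(
        ...
        memory=dict(type='replay', capacity=10000),
        ...
    )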
autoclass:: tensorforce.core.memories.Recent diff --git a/docs/modules/networks.rst b/docs/modules/networks.rst new file mode 100644 index 000000000..d5269700b --- /dev/null +++ b/docs/modules/networks.rst @@ -0,0 +1,86 @@ +Networks +======== + +Default network: ``LayeredNetwork`` with default argument ``layers``, so a ``list`` is a short-form specification of a sequential layer-stack network architecture: + +.. code-block:: python + + Agent.create( + ... + policy=dict(network=[ + dict(type='dense', size=64, activation='tanh'), + dict(type='dense', size=64, activation='tanh') + ]), + ... + ) + +The ``AutoNetwork`` automatically configures a suitable network architecture based on input types and shapes, and offers high-level customization. + +Details about the network layer architecture (policy, baseline, state-preprocessing) can be accessed via ``agent.get_architecture()``. + +Note that the final action/value layer of the policy/baseline network is implicitly added, so the network output can be of arbitrary size and use any activation function, and is only required to be a rank-one embedding vector, or optionally have the same shape as the action in the case of a higher-rank action shape. + +Multi-input and other non-sequential networks are specified as nested list of lists of layers, where each of the inner lists forms a sequential component of the overall network architecture. The following example illustrates how to specify such a more complex network, by using the `special layers `_ ``Register`` and ``Retrieve`` to combine the sequential network components: + +.. code-block:: python + + Agent.create( + states=dict( + observation=dict(type='float', shape=(16, 16, 3), min_value=-1.0, max_value=1.0), + attributes=dict(type='int', shape=(4, 2), num_values=5) + ), + ... + policy=[ + [ + dict(type='retrieve', tensors=['observation']), + dict(type='conv2d', size=32), + dict(type='flatten'), + dict(type='register', tensor='obs-embedding') + ], + [ + dict(type='retrieve', tensors=['attributes']), + dict(type='embedding', size=32), + dict(type='flatten'), + dict(type='register', tensor='attr-embedding') + ], + [ + dict( + type='retrieve', aggregation='concat', + tensors=['obs-embedding', 'attr-embedding'] + ), + dict(type='dense', size=64) + ] + ], + ... + ) + +In the case of multiple action components, some policy types, like `parametrized_distributions`, support the specification of additional network outputs for some/all actions via registered tensors: + +.. code-block:: python + + Agent.create( + ... + actions=dict( + action1=dict(type='int', shape=(), num_values=5), + action2=dict(type='float', shape=(), min_value=-1.0, max_value=1.0) + ), + ... + policy=dict( + type='parametrized_distributions', + network=[ + dict(type='dense', size=64), + dict(type='register', tensor='action1-embedding'), + dict(type='dense', size=64) + # Final output implicitly used for remaining actions + ], + single_output=False + ) + ... + ) + + +.. autoclass:: tensorforce.core.networks.AutoNetwork + +.. autoclass:: tensorforce.core.networks.LayeredNetwork + +.. autoclass:: tensorforce.core.networks.KerasNetwork diff --git a/docs/modules/objectives.rst b/docs/modules/objectives.rst new file mode 100644 index 000000000..8ffec8820 --- /dev/null +++ b/docs/modules/objectives.rst @@ -0,0 +1,11 @@ +Objectives +========== + + +.. autoclass:: tensorforce.core.objectives.PolicyGradient + +.. autoclass:: tensorforce.core.objectives.Value + +.. autoclass:: tensorforce.core.objectives.DeterministicPolicyGradient + +.. 
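An objective specifies the loss which the optimizer minimizes. For the pre-configured agents it is set internally; the generic ``Tensorforce`` agent accepts it directly. A minimal sketch, assuming an ``objective`` argument on the generic agent and the usual short-form type names (not a complete agent specification):

.. code-block:: python

    Agent.create(
        agent='tensorforce',
        ...
        objective='policy_gradient',
        ...
    )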
autoclass:: tensorforce.core.objectives.Plus
diff --git a/docs/modules/optimizers.rst b/docs/modules/optimizers.rst
new file mode 100644
index 000000000..a04481b62
--- /dev/null
+++ b/docs/modules/optimizers.rst
@@ -0,0 +1,39 @@
+Optimizers
+==========
+
+Default optimizer: ``OptimizerWrapper``, which offers additional update-modifier options, so that instead of using ``TFOptimizer`` directly, a customized Adam optimizer can be specified via:
+
+.. code-block:: python
+
+    Agent.create(
+        ...
+        optimizer=dict(
+            optimizer='adam', learning_rate=1e-3, clipping_threshold=1e-2,
+            multi_step=10, subsampling_fraction=64, linesearch_iterations=5,
+            doublecheck_update=True
+        ),
+        ...
+    )
+
+
+.. autoclass:: tensorforce.core.optimizers.OptimizerWrapper
+
+.. autoclass:: tensorforce.core.optimizers.TFOptimizer
+
+.. autoclass:: tensorforce.core.optimizers.NaturalGradient
+
+.. autoclass:: tensorforce.core.optimizers.Evolutionary
+
+.. autoclass:: tensorforce.core.optimizers.ClippingStep
+
+.. autoclass:: tensorforce.core.optimizers.MultiStep
+
+.. autoclass:: tensorforce.core.optimizers.DoublecheckStep
+
+.. autoclass:: tensorforce.core.optimizers.LinesearchStep
+
+.. autoclass:: tensorforce.core.optimizers.SubsamplingStep
+
+.. autoclass:: tensorforce.core.optimizers.Synchronization
+
+.. autoclass:: tensorforce.core.optimizers.Plus
diff --git a/docs/modules/parameters.rst b/docs/modules/parameters.rst
new file mode 100644
index 000000000..767b1e848
--- /dev/null
+++ b/docs/modules/parameters.rst
@@ -0,0 +1,55 @@
+Parameters
+==========
+
+Tensorforce distinguishes between two kinds of agent/module arguments of primitive type (bool/int/float): those which specify part of the TensorFlow model architecture, like the layer size, and those which specify a value within the architecture, like the learning rate. Whereas the former are statically defined as part of the agent initialization, the latter can be dynamically adjusted afterwards. These dynamic hyperparameters are indicated by ``parameter`` as part of their argument type specification in the documentation, and can alternatively be assigned a parameter module instead of a constant value, for instance, to specify a decaying learning rate.
+
+Default parameter: ``Constant``, so a ``bool``/``int``/``float`` value is a short-form specification of a constant (dynamic) parameter:
+
+.. code-block:: python
+
+    Agent.create(
+        ...
+        exploration=0.1,
+        ...
+    )
+
+Example of how to specify an exponentially decaying learning rate:
+
+.. code-block:: python
+
+    Agent.create(
+        ...
+        optimizer=dict(optimizer='adam', learning_rate=dict(
+            type='exponential', unit='timesteps', num_steps=1000,
+            initial_value=0.01, decay_rate=0.5
+        )),
+        ...
+    )
+
+Example of how to specify a linearly increasing reward horizon:
+
+.. code-block:: python
+
+    Agent.create(
+        ...
+        reward_estimation=dict(horizon=dict(
+            type='linear', unit='episodes', num_steps=1000,
+            initial_value=10, final_value=50
+        )),
+        ...
+    )
+
+
+.. autoclass:: tensorforce.core.parameters.Constant
+
+.. autoclass:: tensorforce.core.parameters.Linear
+
+.. autoclass:: tensorforce.core.parameters.PiecewiseConstant
+
+.. autoclass:: tensorforce.core.parameters.Exponential
+
+.. autoclass:: tensorforce.core.parameters.Decaying
+
+.. autoclass:: tensorforce.core.parameters.OrnsteinUhlenbeck
+
+.. 
autoclass:: tensorforce.core.parameters.Random diff --git a/docs/modules/policies.rst b/docs/modules/policies.rst new file mode 100644 index 000000000..c20655fda --- /dev/null +++ b/docs/modules/policies.rst @@ -0,0 +1,84 @@ +Policies +======== + +Default policy: depends on agent configuration, but always with default argument ``network`` (with default argument ``layers``), so a ``list`` is a short-form specification of a sequential layer-stack network architecture: + +.. code-block:: python + + Agent.create( + ... + policy=[ + dict(type='dense', size=64, activation='tanh'), + dict(type='dense', size=64, activation='tanh') + ], + ... + ) + +Or simply: + +.. code-block:: python + + Agent.create( + ... + policy=dict(network='auto'), + ... + ) + +See the `networks documentation `_ for more information about how to specify a network. + +Example of a full parametrized-distributions policy specification with customized distribution and decaying temperature: + +.. code-block:: python + + Agent.create( + ... + policy=dict( + type='parametrized_distributions', + network=[ + dict(type='dense', size=64, activation='tanh'), + dict(type='dense', size=64, activation='tanh') + ], + distributions=dict( + float=dict(type='gaussian', stddev_mode='global'), + bounded_action=dict(type='beta') + ), + temperature=dict( + type='decaying', decay='exponential', unit='episodes', + num_steps=100, initial_value=0.01, decay_rate=0.5 + ) + ) + ... + ) + +In the case of multiple action components, some policy types, like `parametrized_distributions`, support the specification of additional network outputs for some/all actions via registered tensors: + +.. code-block:: python + + Agent.create( + ... + actions=dict( + action1=dict(type='int', shape=(), num_values=5), + action2=dict(type='float', shape=(), min_value=-1.0, max_value=1.0) + ), + ... + policy=dict( + type='parametrized_distributions', + network=[ + dict(type='dense', size=64), + dict(type='register', tensor='action1-embedding'), + dict(type='dense', size=64) + # Final output implicitly used for remaining actions + ], + single_output=False + ) + ... + ) + + +.. autoclass:: tensorforce.core.policies.ParametrizedActionValue + +.. autoclass:: tensorforce.core.policies.ParametrizedDistributions + +.. autoclass:: tensorforce.core.policies.ParametrizedStateValue + +.. autoclass:: tensorforce.core.policies.ParametrizedValuePolicy diff --git a/docs/modules/preprocessing.rst b/docs/modules/preprocessing.rst new file mode 100644 index 000000000..f17bad5e5 --- /dev/null +++ b/docs/modules/preprocessing.rst @@ -0,0 +1,47 @@ +Preprocessing +============= + +Example of how to specify state and reward preprocessing: + +.. code-block:: python + + Agent.create( + ... + reward_estimation=dict( + ... + reward_processing=dict(type='clipping', lower=-1.0, upper=1.0) + ), + state_preprocessing=[ + dict(type='image', height=4, width=4, grayscale=True), + dict(type='exponential_normalization', decay=0.999) + ], + ... + ) + + +.. autoclass:: tensorforce.core.layers.Clipping + :noindex: + +.. autoclass:: tensorforce.core.layers.Image + :noindex: + +.. autoclass:: tensorforce.core.layers.LinearNormalization + :noindex: + +.. autoclass:: tensorforce.core.layers.ExponentialNormalization + :noindex: + +.. autoclass:: tensorforce.core.layers.InstanceNormalization + :noindex: + +.. autoclass:: tensorforce.core.layers.Deltafier + :noindex: + +.. autoclass:: tensorforce.core.layers.Sequence + :noindex: + +.. autoclass:: tensorforce.core.layers.Activation + :noindex: + +.. 
autoclass:: tensorforce.core.layers.Dropout + :noindex: diff --git a/docs/preprocessing.md b/docs/preprocessing.md deleted file mode 100644 index 9d5adb8ee..000000000 --- a/docs/preprocessing.md +++ /dev/null @@ -1,160 +0,0 @@ -Preprocessing -============= - -Often it is necessary to modify state input tensors before passing them -to the reinforcement learning agent. This could be due to various -reasons, e.g.: - -- Feature scaling / input normalization, -- Data reduction, -- Ensuring the Markov property by concatenating multiple states (e.g. - in Atari) - -TensorForce comes with a number of ready-to-use preprocessors, a -preprocessing stack and easy ways to implement your own preprocessors. - -Usage ------ - -The - -Each preprocessor implements three methods: - -1. The constructor (`__init__`) for parameter initialization -2. `process(state)` takes a state and returns the processed state -3. `processed_shape(original_shape)` takes a shape and returns the processed - shape - -The preprocessing stack iteratively calls these functions of all -preprocessors in the stack and returns the result. - -### Using one preprocessor - -```python -from tensorforce.core.preprocessing import Sequence - -pp_seq = Sequence(4) # initialize preprocessor (return sequence of last 4 states) - -state = env.reset() # reset environment -processed_state = pp_seq.process(state) # process state -``` - -### Using a preprocessing stack - -You can stack multipe preprocessors: - -```python -from tensorforce.core.preprocessing import Preprocessing, Grayscale, Sequence - -pp_gray = Grayscale() # initialize grayscale preprocessor -pp_seq = Sequence(4) # initialize sequence preprocessor - -stack = Preprocessing() # initialize preprocessing stack -stack.add(pp_gray) # add grayscale preprocessor to stack -stack.add(pp_seq) # add maximum preprocessor to stack - -state = env.reset() # reset environment -processed_state = stack.process(state) # process state -``` - -### Using a configuration dict - -If you use configuration objects, you can build your preprocessing stack -from a config: - -```python -from tensorforce.core.preprocessing import Preprocessing - -preprocessing_config = [ - { - "type": "image_resize", - "width": 84, - "height": 84 - }, { - "type": "grayscale" - }, { - "type": "center" - }, { - "type": "sequence", - "length": 4 - } -] - -stack = Preprocessing.from_spec(preprocessing_config) -config.state_shape = stack.shape(config.state_shape) -``` - -The `Agent` class expects a *preprocessing* configuration parameter and then -handles preprocessing automatically: - -``` -from tensorforce.agents import DQNAgent - -agent = DQNAgent(config=dict( - states=..., - actions=..., - preprocessing=preprocessing_config, - # ... -)) -``` - - -Ready-to-use preprocessors --------------------------- - -These are the preprocessors that come with TensorForce: - -### Standardize - -```eval_rst - .. autoclass:: tensorforce.core.preprocessing.Standardize - :noindex: - :show-inheritance: - :members: -``` - -### Grayscale - -```eval_rst - .. autoclass:: tensorforce.core.preprocessing.Grayscale - :noindex: - :show-inheritance: - :members: -``` - -### ImageResize - -```eval_rst - .. autoclass:: tensorforce.core.preprocessing.ImageResize - :noindex: - :show-inheritance: - :members: -``` - -### Normalize - -```eval_rst - .. autoclass:: tensorforce.core.preprocessing.Normalize - :noindex: - :show-inheritance: - :members: -``` - -### Sequence - -```eval_rst - .. 
autoclass:: tensorforce.core.preprocessing.Sequence - :noindex: - :show-inheritance: - :members: -``` - -Building your own preprocessor ------------------------------- - -All preprocessors should inherit from -`tensorforce.core.preprocessing.Preprocessor`. - -For a start, please refer to the source of the [Grayscale -preprocessor](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/core/preprocessing/grayscale.py). - diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..45ed003a0 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,8 @@ +m2r +mistune == 2.0.3 # temporary +msgpack +msgpack-numpy +recommonmark +sphinx +sphinx-rtd-theme +tensorflow diff --git a/docs/runner.md b/docs/runner.md deleted file mode 100644 index 4071c3eee..000000000 --- a/docs/runner.md +++ /dev/null @@ -1,143 +0,0 @@ -Runners -======= - -A "runner" manages the interaction between the Environment and the -Agent. TensorForce comes with ready-to-use runners. Of course, you can -implement your own runners, too. If you are not using simulation -environments, the runner is simply your application code using the Agent -API. - -> Environment <-> Runner <-> Agent <-> Model - -Ready-to-use runners --------------------- - -We implemented a standard runner, a threaded runner (for real-time -interaction e.g. with OpenAI Universe) and a distributed runner for A3C -variants. - -### Runner - -This is the standard runner. It requires an agent and an environment for -initialization: - -```python -from tensorforce.execution import Runner - -runner = Runner( - agent = agent, # Agent object - environment = env # Environment object -) -``` - -A reinforcement learning agent observes states from the environment, -selects actions and collect experience which is used to update its model -and improve action selection. You can get information about our -ready-to-use agents [here](agents_models.html). - -The environment object is either the "real" environment, or a proxy -which fulfills the actions selected by the agent in the real world. You -can find information about environments [here](environments.html). - -The runner is started with the `Runner.run(...)` method: - -```python -runner.run( - episodes = int, # number of episodes to run - max_timesteps = int, # maximum timesteps per episode - episode_finished = object, # callback function called when episode is finished -) -runner.close() -``` - -You can use the episode\_finished callback for printing performance -feedback: - -```python -def episode_finished(r): - if r.episode % 10 == 0: - print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1)) - print("Episode reward: {}".format(r.episode_rewards[-1])) - print("Average of last 10 rewards: {}".format(np.mean(r.episode_rewards[-10:]))) - return True -``` - -#### Using the Runner - -Here is some example code for using the runner (without preprocessing). 
- -```python -import logging - -from tensorforce.contrib.openai_gym import OpenAIGym -from tensorforce.agents import DQNAgent -from tensorforce.execution import Runner - -def main(): - gym_id = 'CartPole-v0' - max_episodes = 10000 - max_timesteps = 1000 - - env = OpenAIGym(gym_id) - network_spec = [ - dict(type='dense', size=32, activation='tanh'), - dict(type='dense', size=32, activation='tanh') - ] - - agent = DQNAgent( - states_spec=env.states, - actions_spec=env.actions, - network_spec=network_spec, - batch_size=64 - ) - - runner = Runner(agent, env) - - report_episodes = 10 - - def episode_finished(r): - if r.episode % report_episodes == 0: - logging.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep)) - logging.info("Episode reward: {}".format(r.episode_rewards[-1])) - logging.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) - return True - - print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env)) - - runner.run(max_episodes, max_timesteps, episode_finished=episode_finished) - runner.close() - - print("Learning finished. Total episodes: {ep}".format(ep=runner.episode)) - -if __name__ == '__main__': - main() -``` - - -Building your own runner ------------------------- - -There are three mandatory tasks any runner implements: Obtaining an -action from the agent, passing it to the environment, and passing the -resulting observation to the agent. - -```python -# Get action -action = agent.act(state) - -# Execute action in the environment -state, reward, terminal_state = environment.execute(action) - -# Pass observation to the agent -agent.observe(state, action, reward, terminal_state) -``` - -The key idea here is the separation of concerns. External code should -not need to manage batches or remember network features, this is that -the agent is for. Conversely, an agent need not concern itself with how -a model is implemented and the API should facilitate easy combination of -different agents and models. - -If you would like to build your own runner, it is probably a good idea -to take a look at the [source code of our Runner -class](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/execution/runner.py). diff --git a/docs/summary_spec.md b/docs/summary_spec.md deleted file mode 100644 index b4f68c959..000000000 --- a/docs/summary_spec.md +++ /dev/null @@ -1,132 +0,0 @@ -TensorForce: Details for "summary_spec" agent parameters -==================================================================== - -[![Docs](https://readthedocs.org/projects/tensorforce/badge)](http://tensorforce.readthedocs.io/en/latest/) -[![Gitter](https://badges.gitter.im/reinforceio/TensorForce.svg)](https://docs.google.com/forms/d/1_UD5Pb5LaPVUviD0pO0fFcEnx_vwenvuc00jmP2rRIc/) -[![Build Status](https://travis-ci.org/reinforceio/tensorforce.svg?branch=master)](https://travis-ci.org/reinforceio/tensorforce) -[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/reinforceio/tensorforce/blob/master/LICENSE) - -summary_spec ------------- - -TensorForce has the ability to record summary data for use with TensorBoard -as well STDIO and file export. This is accomplished through dictionary -parameter called "summary_spec" passed to the agent on initialization. 
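The legacy `summary_spec` parameter documented here corresponds to the `summarizer` argument of `Agent.create` in the current interface. A minimal sketch, assuming `directory` and `summaries` as the sub-keys (exact key names and accepted values may differ between versions):

```python
from tensorforce import Agent, Environment

environment = Environment.create(environment='gym', level='CartPole-v1')

agent = Agent.create(
    agent='ppo',
    environment=environment,    # states/actions inferred from the environment
    batch_size=10,              # update batch size
    summarizer=dict(
        directory='summaries',  # TensorBoard log directory (assumed key name)
        summaries='all'         # or a list of summary labels (assumed key/value)
    )
)
```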
- -"summary_spec" supports the following optional dictionary entries: - -```eval_rst -+--------------+------------------------------------------------------------+ -| Key | Value | -+==============+============================================================+ -| directory | (str) Path to storage for TensorBoard summary data | -+--------------+------------------------------------------------------------+ -| steps | (int) Frequency in steps between storage of summary data | -+--------------+------------------------------------------------------------+ -| seconds | (int) Frequency in seconds to store summary data | -+--------------+------------------------------------------------------------+ -| labels | (list) Requested Export, See "*LABELS*" section | -+--------------+------------------------------------------------------------+ -| meta\_dict | (dict) For used with label "configuration" | -+--------------+------------------------------------------------------------+ -``` - - -LABELS ------- -```eval_rst - -+------------------------+---------------------------------------------------------+ -| Entry | Data produced | -+========================+=========================================================+ -| losses | Training total-loss and "loss-without-regularization" | -+------------------------+---------------------------------------------------------+ -| total-loss | Final calculated loss value | -+------------------------+---------------------------------------------------------+ -| variables | Network variables | -+------------------------+---------------------------------------------------------+ -| inputs | Equivalent to: ['states', 'actions', 'rewards'] | -+------------------------+---------------------------------------------------------+ -| states | Histogram of input state space | -+------------------------+---------------------------------------------------------+ -| actions | Histogram of input action space | -+------------------------+---------------------------------------------------------+ -| rewards | Histogram of input reward space | -+------------------------+---------------------------------------------------------+ -| gradients | Histogram and scalar gradients | -+------------------------+---------------------------------------------------------+ -| gradients\_histogram | Variable gradients as histograms | -+------------------------+---------------------------------------------------------+ -| gradients\_scalar | Variable Mean/Variance of gradients as scalar | -+------------------------+---------------------------------------------------------+ -| regularization | Regularization values | -+------------------------+---------------------------------------------------------+ -| **configuration** | See *Configuration Export* for more detail | -+------------------------+---------------------------------------------------------+ -| configuration | Export configuration to "TEXT" tab in TensorBoard | -+------------------------+---------------------------------------------------------+ -| print\_configuration | Prints configuration to STDOUT | -+------------------------+---------------------------------------------------------+ -``` - -```python -from tensorforce.agents import PPOAgent - -# Create a Proximal Policy Optimization agent -agent = PPOAgent( - states_spec=..., - actions_spec=..., - network_spec=..., - summary_spec=dict(directory="./board/", - steps=50, - labels=['configuration', - 'gradients_scalar', - 'regularization', - 'inputs', - 'losses', - 'variables'] - ), - ... 
-) -``` - -Configuration Export --------------------- - -Adding the "configuration" label will create a "TEXT" tab in TensorBoard -that contains all the parameters passed to the Agent. By using the additional -"summary_spec" dictionary key "meta_dict", custom keys and values can be added -to the data export. The user may want to pass "Description", "Experiement #", - "InputDataSet", etc. - -If a key is already in use within TensorForce an error will be raised to -notify you to change the key value. To use the custom feature, create a -dictionary with keys to export: -```python -from tensorforce.agents import PPOAgent - -metaparams['MyDescription'] = "This experiment covers the first test ...." -metaparams['My2D'] = np.ones((9,9)) # 9x9 matrix of 1.0's -metaparams['My1D'] = np.ones((9)) # Column of 9 1.0's - -# Create a Proximal Policy Optimization agent -agent = PPOAgent( - states_spec=..., - actions_spec=..., - network_spec=..., - summary_spec=dict(directory="./board/", - steps=50, - meta_dict=metaparams, #Add custom keys to export - labels=['configuration', - 'gradients_scalar', - 'regularization', - 'inputs', - 'losses', - 'variables'] - ), - ... -) -``` - -Use the "print_configuration" label to export the configuration data to the -command line's STDOUT. diff --git a/docs/tensorforce/tensorforce.agents.rst b/docs/tensorforce/tensorforce.agents.rst deleted file mode 100644 index 95fafc3b7..000000000 --- a/docs/tensorforce/tensorforce.agents.rst +++ /dev/null @@ -1,141 +0,0 @@ -tensorforce\.agents package -=========================== - -Submodules ----------- - -tensorforce\.agents\.agent module ---------------------------------- - -.. automodule:: tensorforce.agents.agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.batch\_agent module ----------------------------------------- - -.. automodule:: tensorforce.agents.batch_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.constant\_agent module -------------------------------------------- - -.. automodule:: tensorforce.agents.constant_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.ddqn\_agent module ---------------------------------------- - -.. automodule:: tensorforce.agents.ddqn_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.dqfd\_agent module ---------------------------------------- - -.. automodule:: tensorforce.agents.dqfd_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.dqn\_agent module --------------------------------------- - -.. automodule:: tensorforce.agents.dqn_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.dqn\_nstep\_agent module ---------------------------------------------- - -.. automodule:: tensorforce.agents.dqn_nstep_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.learning\_agent module -------------------------------------------- - -.. automodule:: tensorforce.agents.learning_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.memory\_agent module ------------------------------------------ - -.. 
automodule:: tensorforce.agents.memory_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.naf\_agent module --------------------------------------- - -.. automodule:: tensorforce.agents.naf_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.ppo\_agent module --------------------------------------- - -.. automodule:: tensorforce.agents.ppo_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.random\_agent module ------------------------------------------ - -.. automodule:: tensorforce.agents.random_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.trpo\_agent module ---------------------------------------- - -.. automodule:: tensorforce.agents.trpo_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.agents\.vpg\_agent module --------------------------------------- - -.. automodule:: tensorforce.agents.vpg_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.agents - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.contrib.rst b/docs/tensorforce/tensorforce.contrib.rst deleted file mode 100644 index 67920ade3..000000000 --- a/docs/tensorforce/tensorforce.contrib.rst +++ /dev/null @@ -1,87 +0,0 @@ -tensorforce\.contrib package -============================ - -Submodules ----------- - -tensorforce\.contrib\.ale module --------------------------------- - -.. automodule:: tensorforce.contrib.ale - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.contrib\.deepmind\_lab module ------------------------------------------- - -.. automodule:: tensorforce.contrib.deepmind_lab - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.contrib\.maze\_explorer module -------------------------------------------- - -.. automodule:: tensorforce.contrib.maze_explorer - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.contrib\.openai\_gym module ----------------------------------------- - -.. automodule:: tensorforce.contrib.openai_gym - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.contrib\.openai\_universe module ---------------------------------------------- - -.. automodule:: tensorforce.contrib.openai_universe - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.contrib\.remote\_environment module ------------------------------------------------- - -.. automodule:: tensorforce.contrib.remote_environment - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.contrib\.state\_settable\_environment module ---------------------------------------------------------- - -.. automodule:: tensorforce.contrib.state_settable_environment - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.contrib\.unreal\_engine module -------------------------------------------- - -.. automodule:: tensorforce.contrib.unreal_engine - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. 
automodule:: tensorforce.contrib - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.baselines.rst b/docs/tensorforce/tensorforce.core.baselines.rst deleted file mode 100644 index 181185bd6..000000000 --- a/docs/tensorforce/tensorforce.core.baselines.rst +++ /dev/null @@ -1,60 +0,0 @@ -tensorforce\.core\.baselines package -==================================== - -Submodules ----------- - -tensorforce\.core\.baselines\.aggregated\_baseline module ---------------------------------------------------------- - -.. automodule:: tensorforce.core.baselines.aggregated_baseline - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.baselines\.baseline module ---------------------------------------------- - -.. automodule:: tensorforce.core.baselines.baseline - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.baselines\.cnn\_baseline module --------------------------------------------------- - -.. automodule:: tensorforce.core.baselines.cnn_baseline - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.baselines\.mlp\_baseline module --------------------------------------------------- - -.. automodule:: tensorforce.core.baselines.mlp_baseline - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.baselines\.network\_baseline module ------------------------------------------------------- - -.. automodule:: tensorforce.core.baselines.network_baseline - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.core.baselines - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.distributions.rst b/docs/tensorforce/tensorforce.core.distributions.rst deleted file mode 100644 index 4f2d23dce..000000000 --- a/docs/tensorforce/tensorforce.core.distributions.rst +++ /dev/null @@ -1,60 +0,0 @@ -tensorforce\.core\.distributions package -======================================== - -Submodules ----------- - -tensorforce\.core\.distributions\.bernoulli module --------------------------------------------------- - -.. automodule:: tensorforce.core.distributions.bernoulli - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.distributions\.beta module ---------------------------------------------- - -.. automodule:: tensorforce.core.distributions.beta - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.distributions\.categorical module ----------------------------------------------------- - -.. automodule:: tensorforce.core.distributions.categorical - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.distributions\.distribution module ------------------------------------------------------ - -.. automodule:: tensorforce.core.distributions.distribution - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.distributions\.gaussian module -------------------------------------------------- - -.. automodule:: tensorforce.core.distributions.gaussian - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. 
automodule:: tensorforce.core.distributions - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.explorations.rst b/docs/tensorforce/tensorforce.core.explorations.rst deleted file mode 100644 index 13aee237d..000000000 --- a/docs/tensorforce/tensorforce.core.explorations.rst +++ /dev/null @@ -1,69 +0,0 @@ -tensorforce\.core\.explorations package -======================================= - -Submodules ----------- - -tensorforce\.core\.explorations\.constant module ------------------------------------------------- - -.. automodule:: tensorforce.core.explorations.constant - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.explorations\.epsilon\_anneal module -------------------------------------------------------- - -.. automodule:: tensorforce.core.explorations.epsilon_anneal - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.explorations\.epsilon\_decay module ------------------------------------------------------- - -.. automodule:: tensorforce.core.explorations.epsilon_decay - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.explorations\.exploration module ---------------------------------------------------- - -.. automodule:: tensorforce.core.explorations.exploration - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.explorations\.linear\_decay module ------------------------------------------------------ - -.. automodule:: tensorforce.core.explorations.linear_decay - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.explorations\.ornstein\_uhlenbeck\_process module --------------------------------------------------------------------- - -.. automodule:: tensorforce.core.explorations.ornstein_uhlenbeck_process - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.core.explorations - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.memories.rst b/docs/tensorforce/tensorforce.core.memories.rst deleted file mode 100644 index c29334449..000000000 --- a/docs/tensorforce/tensorforce.core.memories.rst +++ /dev/null @@ -1,51 +0,0 @@ -tensorforce\.core\.memories package -=================================== - -Submodules ----------- - -tensorforce\.core\.memories\.memory module ------------------------------------------- - -.. automodule:: tensorforce.core.memories.memory - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.memories\.naive\_prioritized\_replay module --------------------------------------------------------------- - -.. automodule:: tensorforce.core.memories.naive_prioritized_replay - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.memories\.prioritized\_replay module -------------------------------------------------------- - -.. automodule:: tensorforce.core.memories.prioritized_replay - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.memories\.replay module ------------------------------------------- - -.. automodule:: tensorforce.core.memories.replay - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. 
automodule:: tensorforce.core.memories - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.networks.rst b/docs/tensorforce/tensorforce.core.networks.rst deleted file mode 100644 index 5b0a2106e..000000000 --- a/docs/tensorforce/tensorforce.core.networks.rst +++ /dev/null @@ -1,42 +0,0 @@ -tensorforce\.core\.networks package -=================================== - -Submodules ----------- - -tensorforce\.core\.networks\.complex\_network module ----------------------------------------------------- - -.. automodule:: tensorforce.core.networks.complex_network - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.networks\.layer module ------------------------------------------ - -.. automodule:: tensorforce.core.networks.layer - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.networks\.network module -------------------------------------------- - -.. automodule:: tensorforce.core.networks.network - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.core.networks - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.optimizers.rst b/docs/tensorforce/tensorforce.core.optimizers.rst deleted file mode 100644 index 1c337d2f5..000000000 --- a/docs/tensorforce/tensorforce.core.optimizers.rst +++ /dev/null @@ -1,112 +0,0 @@ -tensorforce\.core\.optimizers package -===================================== - -Subpackages ------------ - -.. toctree:: - - tensorforce.core.optimizers.solvers - -Submodules ----------- - -tensorforce\.core\.optimizers\.clipped\_step module ---------------------------------------------------- - -.. automodule:: tensorforce.core.optimizers.clipped_step - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.evolutionary module --------------------------------------------------- - -.. automodule:: tensorforce.core.optimizers.evolutionary - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.global\_optimizer module -------------------------------------------------------- - -.. automodule:: tensorforce.core.optimizers.global_optimizer - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.meta\_optimizer module ------------------------------------------------------ - -.. automodule:: tensorforce.core.optimizers.meta_optimizer - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.multi\_step module -------------------------------------------------- - -.. automodule:: tensorforce.core.optimizers.multi_step - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.natural\_gradient module -------------------------------------------------------- - -.. automodule:: tensorforce.core.optimizers.natural_gradient - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.optimized\_step module ------------------------------------------------------ - -.. automodule:: tensorforce.core.optimizers.optimized_step - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.optimizer module ------------------------------------------------ - -.. 
automodule:: tensorforce.core.optimizers.optimizer - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.synchronization module ------------------------------------------------------ - -.. automodule:: tensorforce.core.optimizers.synchronization - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.tf\_optimizer module ---------------------------------------------------- - -.. automodule:: tensorforce.core.optimizers.tf_optimizer - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.core.optimizers - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.optimizers.solvers.rst b/docs/tensorforce/tensorforce.core.optimizers.solvers.rst deleted file mode 100644 index dbe274f12..000000000 --- a/docs/tensorforce/tensorforce.core.optimizers.solvers.rst +++ /dev/null @@ -1,51 +0,0 @@ -tensorforce\.core\.optimizers\.solvers package -============================================== - -Submodules ----------- - -tensorforce\.core\.optimizers\.solvers\.conjugate\_gradient module ------------------------------------------------------------------- - -.. automodule:: tensorforce.core.optimizers.solvers.conjugate_gradient - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.solvers\.iterative module --------------------------------------------------------- - -.. automodule:: tensorforce.core.optimizers.solvers.iterative - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.solvers\.line\_search module ------------------------------------------------------------ - -.. automodule:: tensorforce.core.optimizers.solvers.line_search - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.optimizers\.solvers\.solver module ------------------------------------------------------ - -.. automodule:: tensorforce.core.optimizers.solvers.solver - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.core.optimizers.solvers - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.preprocessing.rst b/docs/tensorforce/tensorforce.core.preprocessing.rst deleted file mode 100644 index d20e7fc92..000000000 --- a/docs/tensorforce/tensorforce.core.preprocessing.rst +++ /dev/null @@ -1,105 +0,0 @@ -tensorforce\.core\.preprocessing package -======================================== - -Submodules ----------- - -tensorforce\.core\.preprocessing\.clip module ---------------------------------------------- - -.. automodule:: tensorforce.core.preprocessing.clip - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.divide module ------------------------------------------------ - -.. automodule:: tensorforce.core.preprocessing.divide - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.grayscale module --------------------------------------------------- - -.. 
automodule:: tensorforce.core.preprocessing.grayscale - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.image\_resize module ------------------------------------------------------- - -.. automodule:: tensorforce.core.preprocessing.image_resize - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.normalize module --------------------------------------------------- - -.. automodule:: tensorforce.core.preprocessing.normalize - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.preprocessor module ------------------------------------------------------ - -.. automodule:: tensorforce.core.preprocessing.preprocessor - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.preprocessor\_stack module ------------------------------------------------------------- - -.. automodule:: tensorforce.core.preprocessing.preprocessor_stack - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.running\_standardize module -------------------------------------------------------------- - -.. automodule:: tensorforce.core.preprocessing.running_standardize - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.sequence module -------------------------------------------------- - -.. automodule:: tensorforce.core.preprocessing.sequence - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.core\.preprocessing\.standardize module ----------------------------------------------------- - -.. automodule:: tensorforce.core.preprocessing.standardize - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.core.preprocessing - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.core.rst b/docs/tensorforce/tensorforce.core.rst deleted file mode 100644 index 8bd5d1e62..000000000 --- a/docs/tensorforce/tensorforce.core.rst +++ /dev/null @@ -1,24 +0,0 @@ -tensorforce\.core package -========================= - -Subpackages ------------ - -.. toctree:: - - tensorforce.core.baselines - tensorforce.core.distributions - tensorforce.core.explorations - tensorforce.core.memories - tensorforce.core.networks - tensorforce.core.optimizers - tensorforce.core.preprocessing - -Module contents ---------------- - -.. automodule:: tensorforce.core - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.environments.rst b/docs/tensorforce/tensorforce.environments.rst deleted file mode 100644 index dbcf7f469..000000000 --- a/docs/tensorforce/tensorforce.environments.rst +++ /dev/null @@ -1,33 +0,0 @@ -tensorforce\.environments package -================================= - -Submodules ----------- - -tensorforce\.environments\.environment module ---------------------------------------------- - -.. automodule:: tensorforce.environments.environment - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.environments\.minimal\_test module ------------------------------------------------ - -.. 
automodule:: tensorforce.environments.minimal_test - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.environments - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.execution.rst b/docs/tensorforce/tensorforce.execution.rst deleted file mode 100644 index 0669519e3..000000000 --- a/docs/tensorforce/tensorforce.execution.rst +++ /dev/null @@ -1,33 +0,0 @@ -tensorforce\.execution package -============================== - -Submodules ----------- - -tensorforce\.execution\.runner module -------------------------------------- - -.. automodule:: tensorforce.execution.runner - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.execution\.threaded\_runner module ------------------------------------------------ - -.. automodule:: tensorforce.execution.threaded_runner - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.execution - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.models.rst b/docs/tensorforce/tensorforce.models.rst deleted file mode 100644 index 1f66a43ad..000000000 --- a/docs/tensorforce/tensorforce.models.rst +++ /dev/null @@ -1,114 +0,0 @@ -tensorforce\.models package -=========================== - -Submodules ----------- - -tensorforce\.models\.constant\_model module -------------------------------------------- - -.. automodule:: tensorforce.models.constant_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.distribution\_model module ------------------------------------------------ - -.. automodule:: tensorforce.models.distribution_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.model module ---------------------------------- - -.. automodule:: tensorforce.models.model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.pg\_log\_prob\_model module ------------------------------------------------- - -.. automodule:: tensorforce.models.pg_log_prob_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.pg\_model module -------------------------------------- - -.. automodule:: tensorforce.models.pg_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.pg\_prob\_ratio\_model module --------------------------------------------------- - -.. automodule:: tensorforce.models.pg_prob_ratio_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.q\_demo\_model module ------------------------------------------- - -.. automodule:: tensorforce.models.q_demo_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.q\_model module ------------------------------------- - -.. automodule:: tensorforce.models.q_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.q\_naf\_model module ------------------------------------------ - -.. automodule:: tensorforce.models.q_naf_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.q\_nstep\_model module -------------------------------------------- - -.. 
automodule:: tensorforce.models.q_nstep_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.models\.random\_model module ------------------------------------------ - -.. automodule:: tensorforce.models.random_model - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.models - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.rst b/docs/tensorforce/tensorforce.rst deleted file mode 100644 index 12a781aff..000000000 --- a/docs/tensorforce/tensorforce.rst +++ /dev/null @@ -1,55 +0,0 @@ -tensorforce package -=================== - -Subpackages ------------ - -.. toctree:: - - tensorforce.agents - tensorforce.contrib - tensorforce.core - tensorforce.environments - tensorforce.execution - tensorforce.models - tensorforce.tests - -Submodules ----------- - -tensorforce\.exception module ------------------------------ - -.. automodule:: tensorforce.exception - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.meta\_parameter\_recorder module ---------------------------------------------- - -.. automodule:: tensorforce.meta_parameter_recorder - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.util module ------------------------- - -.. automodule:: tensorforce.util - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/docs/tensorforce/tensorforce.tests.rst b/docs/tensorforce/tensorforce.tests.rst deleted file mode 100644 index 2230801ef..000000000 --- a/docs/tensorforce/tensorforce.tests.rst +++ /dev/null @@ -1,177 +0,0 @@ -tensorforce\.tests package -========================== - -Submodules ----------- - -tensorforce\.tests\.base\_agent\_test module --------------------------------------------- - -.. automodule:: tensorforce.tests.base_agent_test - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.base\_test module -------------------------------------- - -.. automodule:: tensorforce.tests.base_test - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_constant\_agent module ------------------------------------------------- - -.. automodule:: tensorforce.tests.test_constant_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_ddqn\_agent module --------------------------------------------- - -.. automodule:: tensorforce.tests.test_ddqn_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_dqfd\_agent module --------------------------------------------- - -.. automodule:: tensorforce.tests.test_dqfd_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_dqn\_agent module -------------------------------------------- - -.. automodule:: tensorforce.tests.test_dqn_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_dqn\_memories module ----------------------------------------------- - -.. 
automodule:: tensorforce.tests.test_dqn_memories - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_dqn\_nstep\_agent module --------------------------------------------------- - -.. automodule:: tensorforce.tests.test_dqn_nstep_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_naf\_agent module -------------------------------------------- - -.. automodule:: tensorforce.tests.test_naf_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_ppo\_agent module -------------------------------------------- - -.. automodule:: tensorforce.tests.test_ppo_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_quickstart\_example module ----------------------------------------------------- - -.. automodule:: tensorforce.tests.test_quickstart_example - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_random\_agent module ----------------------------------------------- - -.. automodule:: tensorforce.tests.test_random_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_reward\_estimation module ---------------------------------------------------- - -.. automodule:: tensorforce.tests.test_reward_estimation - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_trpo\_agent module --------------------------------------------- - -.. automodule:: tensorforce.tests.test_trpo_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_tutorial\_code module ------------------------------------------------ - -.. automodule:: tensorforce.tests.test_tutorial_code - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_vpg\_agent module -------------------------------------------- - -.. automodule:: tensorforce.tests.test_vpg_agent - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_vpg\_baselines module ------------------------------------------------ - -.. automodule:: tensorforce.tests.test_vpg_baselines - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -tensorforce\.tests\.test\_vpg\_optimizers module ------------------------------------------------- - -.. automodule:: tensorforce.tests.test_vpg_optimizers - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tensorforce.tests - :members: - :undoc-members: - :inherited-members: - :show-inheritance: diff --git a/examples/__init__.py b/examples/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/act_experience_update_interface.py b/examples/act_experience_update_interface.py new file mode 100644 index 000000000..e32a0f53e --- /dev/null +++ b/examples/act_experience_update_interface.py @@ -0,0 +1,78 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce import Agent, Environment + + +def main(): + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + + # Train for 100 episodes + for episode in range(100): + + # Record episode experience + episode_states = list() + episode_internals = list() + episode_actions = list() + episode_terminal = list() + episode_reward = list() + + # Episode using independent-act and agent.initial_internals() + states = environment.reset() + internals = agent.initial_internals() + terminal = False + sum_rewards = 0.0 + while not terminal: + episode_states.append(states) + episode_internals.append(internals) + actions, internals = agent.act(states=states, internals=internals, independent=True) + episode_actions.append(actions) + states, terminal, reward = environment.execute(actions=actions) + episode_terminal.append(terminal) + episode_reward.append(reward) + sum_rewards += reward + print('Episode {}: {}'.format(episode, sum_rewards)) + + # Feed recorded experience to agent + agent.experience( + states=episode_states, internals=episode_internals, actions=episode_actions, + terminal=episode_terminal, reward=episode_reward + ) + + # Perform update + agent.update() + + # Evaluate for 100 episodes + sum_rewards = 0.0 + for _ in range(100): + states = environment.reset() + internals = agent.initial_internals() + terminal = False + while not terminal: + actions, internals = agent.act( + states=states, internals=internals, independent=True, deterministic=True + ) + states, terminal, reward = environment.execute(actions=actions) + sum_rewards += reward + print('Mean evaluation return:', sum_rewards / 100.0) + + # Close agent and environment + agent.close() + environment.close() + + +if __name__ == '__main__': + main() diff --git a/examples/act_observe_interface.py b/examples/act_observe_interface.py new file mode 100644 index 000000000..9ce0c7cee --- /dev/null +++ b/examples/act_observe_interface.py @@ -0,0 +1,58 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================== + +from tensorforce import Agent, Environment + + +def main(): + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + + # Train for 100 episodes + for episode in range(100): + + # Episode using act and observe + states = environment.reset() + terminal = False + sum_rewards = 0.0 + num_updates = 0 + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + num_updates += agent.observe(terminal=terminal, reward=reward) + sum_rewards += reward + print('Episode {}: return={} updates={}'.format(episode, sum_rewards, num_updates)) + + # Evaluate for 100 episodes + sum_rewards = 0.0 + for _ in range(100): + states = environment.reset() + internals = agent.initial_internals() + terminal = False + while not terminal: + actions, internals = agent.act( + states=states, internals=internals, independent=True, deterministic=True + ) + states, terminal, reward = environment.execute(actions=actions) + sum_rewards += reward + print('Mean evaluation return:', sum_rewards / 100.0) + + # Close agent and environment + agent.close() + environment.close() + + +if __name__ == '__main__': + main() diff --git a/examples/act_observe_vectorized.py b/examples/act_observe_vectorized.py new file mode 100644 index 000000000..8e3547f9f --- /dev/null +++ b/examples/act_observe_vectorized.py @@ -0,0 +1,69 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from tensorforce import Agent, Environment + + +def main(): + num_parallel = 8 + environment = Environment.create(environment='custom_cartpole', max_episode_timesteps=500) + agent = Agent.create( + agent='benchmarks/configs/ppo.json', environment=environment, + parallel_interactions=num_parallel + ) + + # Train for 100 episodes + for episode in range(0, 100, num_parallel): + + # Episode using act and observe + parallel, states = environment.reset(num_parallel=num_parallel) + terminal = (parallel < 0) # all false + sum_rewards = 0.0 + num_updates = 0 + while not terminal.all(): + actions = agent.act(states=states, parallel=parallel) + next_parallel, states, terminal, reward = environment.execute(actions=actions) + num_updates += agent.observe(terminal=terminal, reward=reward, parallel=parallel) + parallel = next_parallel + sum_rewards += reward.sum() + print('Episode {}: return={} updates={}'.format( + episode, sum_rewards / num_parallel, num_updates + )) + + # Evaluate for 100 episodes + num_parallel = 4 + num_episodes = 100 + sum_rewards = 0.0 + for _ in range(0, num_episodes, num_parallel): + parallel, states = environment.reset(num_parallel=num_parallel) + internals = agent.initial_internals() + internals = [internals for _ in range(num_parallel)] + terminal = (parallel < 0) # all false + while not terminal.all(): + actions, internals = agent.act( + states=states, internals=internals, independent=True, deterministic=True + ) + _, states, terminal, reward = environment.execute(actions=actions) + internals = [internal for internal, term in zip(internals, terminal) if not term] + sum_rewards += reward.sum() + print('Mean evaluation return:', sum_rewards / num_episodes) + + # Close agent and environment + agent.close() + environment.close() + + +if __name__ == '__main__': + main() diff --git a/examples/action_masking.py b/examples/action_masking.py new file mode 100644 index 000000000..b4a7950b8 --- /dev/null +++ b/examples/action_masking.py @@ -0,0 +1,74 @@ +# Copyright 2021 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import numpy as np + +from tensorforce import Environment, Runner + + +class EnvironmentWithMasking(Environment): + """ + States: {0, 1, ..., 9, 10} + Actions: {-1, 0, 1} + Action masking: action = -1 invalid for state = 0, action = 1 invalid for state = 10 + Reward: + - Positive: [state < 5, action = 1] or [state > 5, action = -1] + - Negative: [state < 5, action = -1] or [state > 5, action = 1] + """ + + def __init__(self): + super().__init__() + + def states(self): + # States specification does not need to include action mask item + return dict(type=int, shape=(), num_values=11) + + def actions(self): + # Only discrete actions can be masked + return dict(type=int, shape=(), num_values=3) + + def reset(self): + # Initial state and associated action mask + self.state = np.random.randint(3, 7) + action_mask = np.asarray([self.state > 0, True, self.state < 10]) + + # Add action mask to states dictionary (mask item is "[NAME]_mask", here "action_mask") + states = dict(state=self.state, action_mask=action_mask) + + return states + + def execute(self, actions): + # Compute terminal and reward + terminal = False + if actions == 1: + reward = -np.abs(self.state / 5.0 - 1.0) + else: + reward = (1 - actions) * (self.state / 5.0 - 1.0) + + # Compute next state and associated action mask + self.state += actions - 1 + action_mask = np.asarray([self.state > 0, True, self.state < 10]) + + # Add action mask to states dictionary (mask item is "[NAME]_mask", here "action_mask") + states = dict(state=self.state, action_mask=action_mask) + + return states, terminal, reward + + +if __name__ == '__main__': + agent = 'benchmarks/configs/ppo.json' + runner = Runner(agent=agent, environment=EnvironmentWithMasking, max_episode_timesteps=20) + runner.run(num_episodes=100) + runner.close() diff --git a/examples/ale.py b/examples/ale.py deleted file mode 100644 index 306e50481..000000000 --- a/examples/ale.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -""" -Arcade Learning Environment execution -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import logging -import os -import sys -import time - -from tensorforce import TensorForceError -import json - -from tensorforce.agents import Agent -from tensorforce.execution import Runner -from tensorforce.contrib.ale import ALE - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument('rom', help="File path of the rom") - parser.add_argument('-a', '--agent-config', help="Agent configuration file") - parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") - parser.add_argument('-fs', '--frame-skip', help="Number of frames to repeat action", type=int, default=1) - parser.add_argument('-rap', '--repeat-action-probability', help="Repeat action probability", type=float, default=0.0) - parser.add_argument('-lolt', '--loss-of-life-termination', help="Loss of life counts as terminal state", action='store_true') - parser.add_argument('-lolr', '--loss-of-life-reward', help="Loss of life reward/penalty. EX: -1 to penalize", type=float, default=0.0) - parser.add_argument('-ds', '--display-screen', action='store_true', default=False, help="Display emulator screen") - parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") - parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") - parser.add_argument('-s', '--save', help="Save agent to this dir") - parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") - parser.add_argument('-l', '--load', help="Load agent from this dir") - parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") - - args = parser.parse_args() - - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) # configurable!!! 
- logger.addHandler(logging.StreamHandler(sys.stdout)) - - environment = ALE(args.rom, frame_skip=args.frame_skip, - repeat_action_probability=args.repeat_action_probability, - loss_of_life_termination=args.loss_of_life_termination, - loss_of_life_reward=args.loss_of_life_reward, - display_screen=args.display_screen) - - if args.agent_config is not None: - with open(args.agent_config, 'r') as fp: - agent_config = json.load(fp=fp) - else: - raise TensorForceError("No agent configuration provided.") - - if args.network_spec is not None: - with open(args.network_spec, 'r') as fp: - network_spec = json.load(fp=fp) - else: - network_spec = None - logger.info("No network configuration provided.") - - agent = Agent.from_spec( - spec=agent_config, - kwargs=dict( - states_spec=environment.states, - actions_spec=environment.actions, - network_spec=network_spec - ) - ) - - if args.debug: - logger.info("-" * 16) - logger.info("Configuration:") - logger.info(agent_config) - - if args.save: - save_dir = os.path.dirname(args.save) - if not os.path.isdir(save_dir): - try: - os.mkdir(save_dir, 0o755) - except OSError: - raise OSError("Cannot save agent to dir {} ()".format(save_dir)) - - runner = Runner( - agent=agent, - environment=environment, - repeat_actions=1 - ) - - report_episodes = args.episodes // 1000 - if args.debug: - report_episodes = 1 - - def episode_finished(r): - if r.episode % report_episodes == 0: - sps = r.timestep / (time.time() - r.start_time) - logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps)) - logger.info("Episode reward: {}".format(r.episode_rewards[-1])) - logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500)) - logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) - return True - - logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) - runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) - runner.close() - logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode)) - - environment.close() - - -if __name__ == '__main__': - main() diff --git a/examples/carla_examples.py b/examples/carla_examples.py new file mode 100644 index 000000000..56e4d3417 --- /dev/null +++ b/examples/carla_examples.py @@ -0,0 +1,92 @@ +"""A collection of examples for CARLAEnvironment""" + +import pygame + +from tensorforce import Agent +from tensorforce.environments import CARLAEnvironment + + +def training_example(num_episodes: int, max_episode_timesteps: int): + # Instantiate the environment (run the CARLA simulator before doing this!) 
+ env = CARLAEnvironment(debug=True) + + # Create your own agent (here is just an example) + agent = Agent.create(agent='ppo', + environment=env, + max_episode_timesteps=max_episode_timesteps, + batch_size=1) + + # Training loop (you couldn't use a Runner instead) + # `weights_dir` and `record_dir` are `None` to prevent saving and recording + env.train(agent=agent, + num_episodes=num_episodes, max_episode_timesteps=max_episode_timesteps, + weights_dir=None, record_dir=None) + + pygame.quit() + + +def custom_env_example(num_episodes: int, max_episode_timesteps: int): + # import some libs + import carla + import numpy as np + + from tensorforce.environments.carla_environment import CARLAEnvironment, SensorSpecs, env_utils + + # Subclass `CARLAEnvironment` to customize it: + class MyCARLAEnvironment(CARLAEnvironment): + # Change actions space: (throttle, steer, brake, reverse) + ACTIONS_SPEC = dict(type='float', shape=(4,), min_value=-1.0, max_value=1.0) + DEFAULT_ACTIONS = np.array([0.0, 0.0, 0.0, 0.0]) + + # Define your own mapping: actions -> carla.VehicleControl + def actions_to_control(self, actions): + self.control.throttle = float((actions[0] + 1) / 2.0) + self.control.steer = float(actions[1]) + self.control.brake = float((actions[2] + 1) / 2.0) + self.control.reverse = bool(actions[3] > 0) + self.control.hand_brake = False + + # Define which sensors to use: + def default_sensors(self) -> dict: + sensors = super().default_sensors() + + # Substitute the default rgb camera with a semantic segmentation camera + sensors['camera'] = SensorSpecs.segmentation_camera(position='front', attachment_type='Rigid', + image_size_x=self.window_size[0], + image_size_y=self.window_size[1], + sensor_tick=self.tick_time) + # Add a radar sensor + sensors['radar'] = SensorSpecs.radar(position='radar', sensor_tick=self.tick_time) + return sensors + + # Define a default agent (only used if env.train(agent=None, ...)) + def default_agent(self, **kwargs) -> Agent: + return Agent.create(agent='ppo', + environment=self, + max_episode_timesteps=kwargs.get('max_episode_timesteps'), + batch_size=1) + + # Define your own reward function: + def reward(self, actions, time_cost=-2.0): + speed = env_utils.speed(self.vehicle) + speed_limit = self.vehicle.get_speed_limit() + + if speed <= speed_limit: + speed_penalty = -1.0 if speed < speed_limit / 2 else 0.0 + else: + speed_penalty = speed_limit - speed + + return time_cost - self.collision_penalty * 2.0 + speed_penalty + + def render(self, sensors_data: dict): + super().render(sensors_data) + env_utils.draw_radar_measurement(debug_helper=self.world.debug, data=sensors_data['radar']) + + # Training: + env = MyCARLAEnvironment(debug=True) + + env.train(agent=None, # pass None to use the default_agent + num_episodes=num_episodes, max_episode_timesteps=max_episode_timesteps, + weights_dir=None, record_dir=None) + + pygame.quit() diff --git a/examples/configs/cnn_dqn2013_network.json b/examples/configs/cnn_dqn2013_network.json deleted file mode 100644 index fb97ec5ea..000000000 --- a/examples/configs/cnn_dqn2013_network.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "type": "conv2d", - "size": 16, - "window": 8, - "stride": 4, - "padding": "VALID" - }, - { - "type": "conv2d", - "size": 32, - "window": 4, - "stride": 2, - "padding": "VALID" - }, - { - "type": "flatten" - }, - { - "type": "dense", - "size": 256, - "activation": "relu" - } -] diff --git a/examples/configs/cnn_dqn_network.json b/examples/configs/cnn_dqn_network.json deleted file mode 100755 index 
75ed07a43..000000000 --- a/examples/configs/cnn_dqn_network.json +++ /dev/null @@ -1,27 +0,0 @@ -[ - { - "type": "conv2d", - "size": 32, - "window": 8, - "stride": 4 - }, - { - "type": "conv2d", - "size": 64, - "window": 4, - "stride": 2 - }, - { - "type": "conv2d", - "size": 64, - "window": 3, - "stride": 1 - }, - { - "type": "flatten" - }, - { - "type": "dense", - "size": 512 - } -] diff --git a/examples/configs/ddpg.json b/examples/configs/ddpg.json deleted file mode 100755 index 1acdb5f79..000000000 --- a/examples/configs/ddpg.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "type": "ddpg_agent", - - "update_mode": { - "unit": "timesteps", - "batch_size": 64, - "frequency": 64 - }, - "memory": { - "type": "replay", - "capacity": 100000, - "include_next_states": true - }, - - "optimizer": { - "type": "adam", - "learning_rate": 1e-4 - }, - - "discount": 0.99, - "entropy_regularization": null, - - "critic_network": { - "size_t0": 64, - "size_t1": 64 - }, - "critic_optimizer": { - "type": "adam", - "learning_rate": 1e-3 - }, - "target_sync_frequency": 1, - "target_update_weight": 0.999, - - "actions_exploration": { - "type": "ornstein_uhlenbeck", - "sigma": 0.3, - "mu": 0.0, - "theta": 0.15 - }, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/dqfd.json b/examples/configs/dqfd.json deleted file mode 100644 index fb805fdfa..000000000 --- a/examples/configs/dqfd.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "type": "dqfd_agent", - - "update_mode": { - "unit": "timesteps", - "batch_size": 64, - "frequency": 4 - }, - "memory": { - "type": "replay", - "capacity": 10000, - "include_next_states": true - }, - - "optimizer": { - "type": "adam", - "learning_rate": 1e-3 - }, - - "discount": 0.99, - "entropy_regularization": null, - - "target_sync_frequency": 1000, - "target_update_weight": 1.0, - - "actions_exploration": { - "type": "epsilon_decay", - "initial_epsilon": 0.5, - "final_epsilon": 0.0, - "timesteps": 10000 - }, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/dqn.json b/examples/configs/dqn.json deleted file mode 100755 index 91db44a7b..000000000 --- a/examples/configs/dqn.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "type": "dqn_agent", - - "update_mode": { - "unit": "timesteps", - "batch_size": 64, - "frequency": 4 - }, - "memory": { - "type": "replay", - "capacity": 10000, - "include_next_states": true - }, - - "optimizer": { - "type": "clipped_step", - "clipping_value": 0.1, - "optimizer": { - "type": "adam", - "learning_rate": 1e-3 - } - }, - - "discount": 0.99, - "entropy_regularization": null, - "double_q_model": true, - - "target_sync_frequency": 1000, - "target_update_weight": 1.0, - - "actions_exploration": { - "type": "epsilon_anneal", - "initial_epsilon": 0.5, - "final_epsilon": 0.0, - "timesteps": 10000 - }, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/dqn_ue4.json b/examples/configs/dqn_ue4.json deleted file mode 100644 index 3de8f2a6b..000000000 --- a/examples/configs/dqn_ue4.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "type": "dqn_agent", - - "update_mode": { - "unit": "timesteps", - "batch_size": 64, - "frequency": 4 - }, - - "memory": { - "type": "replay", - "capacity": 10000, - "include_next_states": true - }, - - "optimizer": { - 
"type": "adam", - "learning_rate": 1e-3 - }, - - "discount": 0.97, - - "states_preprocessing": [ - { - "type": "divide", - "scale": 255 - }, - { - "type": "sequence", - "length": 4, - "add_rank": true - } - ], - - "actions_exploration": { - "type": "epsilon_decay", - "initial_epsilon": 1.0, - "final_epsilon": 0.1, - "timesteps": 100000 - }, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/dqn_visual.json b/examples/configs/dqn_visual.json deleted file mode 100644 index 9b6f7a510..000000000 --- a/examples/configs/dqn_visual.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "type": "dqn_agent", - "batch_size": 64, - "memory": { - "type": "replay", - "capacity": 10000 - }, - "optimizer": { - "type": "adam", - "learning_rate": 1e-3 - }, - - "discount": 0.97, - - "states_preprocessing": [ - { - "type": "image_resize", - "width": 84, - "height": 84 - }, - { - "type": "grayscale" - }, - { - "type": "divide", - "scale": 255 - } - ], - - "actions_exploration": { - "type": "epsilon_decay", - "initial_epsilon": 1.0, - "final_epsilon": 0.1, - "timesteps": 100000 - }, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/mlp2_embedding_network.json b/examples/configs/mlp2_embedding_network.json deleted file mode 100755 index d83fd9fcf..000000000 --- a/examples/configs/mlp2_embedding_network.json +++ /dev/null @@ -1,15 +0,0 @@ -[ - { - "type": "embedding", - "indices": 100, - "size": 32 - }, - { - "type": "dense", - "size": 32 - }, - { - "type": "dense", - "size": 32 - } -] diff --git a/examples/configs/mlp2_lstm_network.json b/examples/configs/mlp2_lstm_network.json deleted file mode 100755 index 5b416fe6a..000000000 --- a/examples/configs/mlp2_lstm_network.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "type": "dense", - "size": 32 - }, - { - "type": "dense", - "size": 32 - }, - { - "type": "internal_lstm", - "size": 32 - } -] diff --git a/examples/configs/mlp2_network.json b/examples/configs/mlp2_network.json deleted file mode 100644 index f714a0474..000000000 --- a/examples/configs/mlp2_network.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "type": "dense", - "size": 32, - "activation": "relu" - }, - { - "type": "dense", - "size": 32, - "activation": "relu" - } -] diff --git a/examples/configs/mlp2_normalized_network.json b/examples/configs/mlp2_normalized_network.json deleted file mode 100644 index f9d7001a5..000000000 --- a/examples/configs/mlp2_normalized_network.json +++ /dev/null @@ -1,34 +0,0 @@ -[ - { - "type": "linear", - "size": 64 - }, - { - "type": "tf_layer", - "layer": "batch_normalization" - }, - { - "type": "nonlinearity", - "name": "relu" - }, - - - { - "type": "linear", - "size": 64 - }, - { - "type": "tf_layer", - "layer": "batch_normalization" - }, - { - "type": "nonlinearity", - "name": "relu" - }, - - { - "type": "dense", - "size": 64, - "activation": null - } -] diff --git a/examples/configs/naf.json b/examples/configs/naf.json deleted file mode 100644 index 44660d596..000000000 --- a/examples/configs/naf.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "type": "naf_agent", - - "update_mode": { - "unit": "timesteps", - "batch_size": 64, - "frequency": 4 - }, - "memory": { - "type": "replay", - "capacity": 10000, - "include_next_states": true - }, - - "optimizer": { - "type": "adam", - "learning_rate": 1e-3 - }, - - "discount": 0.99, - "entropy_regularization": null, - 
"double_q_model": true, - - "target_sync_frequency": 1000, - "target_update_weight": 1.0, - - "actions_exploration": { - "type": "ornstein_uhlenbeck", - "sigma": 0.2, - "mu": 0.0, - "theta": 0.15 - }, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/ppo.json b/examples/configs/ppo.json deleted file mode 100644 index e906ebe3d..000000000 --- a/examples/configs/ppo.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "type": "ppo_agent", - - "update_mode": { - "unit": "episodes", - "batch_size": 10, - "frequency": 10 - }, - "memory": { - "type": "latest", - "include_next_states": false, - "capacity": 5000 - }, - - "step_optimizer": { - "type": "adam", - "learning_rate": 1e-3 - }, - "subsampling_fraction": 0.1, - "optimization_steps": 50, - - "discount": 0.99, - "entropy_regularization": 0.01, - "gae_lambda": null, - "likelihood_ratio_clipping": 0.2, - - "baseline_mode": "states", - "baseline": { - "type": "mlp", - "sizes": [32, 32] - }, - "baseline_optimizer": { - "type": "multi_step", - "optimizer": { - "type": "adam", - "learning_rate": 1e-3 - }, - "num_steps": 5 - }, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/trpo.json b/examples/configs/trpo.json deleted file mode 100644 index b402e20fc..000000000 --- a/examples/configs/trpo.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "type": "trpo_agent", - - "update_mode": { - "unit": "episodes", - "batch_size": 20, - "frequency": 20 - }, - "memory": { - "type": "latest", - "include_next_states": false, - "capacity": 5000 - }, - - "learning_rate": 1e-2, - - "discount": 0.99, - "entropy_regularization": null, - "gae_lambda": null, - "likelihood_ratio_clipping": null, - - "baseline_mode": null, - "baseline": null, - "baseline_optimizer": null, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/vpg.json b/examples/configs/vpg.json deleted file mode 100755 index 6534bbb5f..000000000 --- a/examples/configs/vpg.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "type": "vpg_agent", - - "update_mode": { - "unit": "episodes", - "batch_size": 20, - "frequency": 20 - }, - "memory": { - "type": "latest", - "include_next_states": false, - "capacity": 5000 - }, - - "optimizer": { - "type": "adam", - "learning_rate": 2e-2 - }, - - "discount": 0.99, - "entropy_regularization": null, - "gae_lambda": null, - - "baseline_mode": "states", - "baseline": { - "type": "mlp", - "sizes": [32, 32] - }, - "baseline_optimizer": { - "type": "multi_step", - "optimizer": { - "type": "adam", - "learning_rate": 1e-3 - }, - "num_steps": 5 - }, - - "saver": { - "directory": null, - "seconds": 600 - }, - "summarizer": { - "directory": null, - "labels": [], - "seconds": 120 - } -} diff --git a/examples/configs/vpg_baseline_visual.json b/examples/configs/vpg_baseline_visual.json deleted file mode 100644 index fce4d4639..000000000 --- a/examples/configs/vpg_baseline_visual.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "type": "vpg_agent", - "batch_size": 4000, - "optimizer": { - "type": "adam", - "learning_rate": 1e-2 - }, - - "discount": 0.99, - "entropy_regularization": null, - "gae_lambda": 0.97, - - "baseline_mode": "states", - "baseline": { - "type": "mlp", - "sizes": [32, 32] - }, - "baseline_optimizer": { - "type": "multi_step", - "optimizer": { - "type": 
"adam", - "learning_rate": 1e-3 - }, - "num_steps": 5 - }, - - "states_preprocessing": [ - { - "type": "image_resize", - "width": 84, - "height": 84 - }, - { - "type": "grayscale" - }, - { - "type": "center" - }, - { - "type": "sequence", - "length": 4 - } - ] -} diff --git a/examples/export_saved_model.py b/examples/export_saved_model.py new file mode 100644 index 000000000..f215dab36 --- /dev/null +++ b/examples/export_saved_model.py @@ -0,0 +1,101 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np +import tensorflow as tf + +from tensorforce import Environment, Runner + + +def main(): + # Train agent + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + runner = Runner(agent='benchmarks/configs/ppo.json', environment=environment) + runner.run(num_episodes=100) + + # Save agent SavedModel + runner.agent.save(directory='saved-model', format='saved-model') + runner.close() + + # Model serving, potentially using different programming language etc + # (For regular model saving and loading within Python, see save_load_agent.py example) + + # Load agent SavedModel + agent = tf.saved_model.load(export_dir='saved-model') + + # Evaluate for 100 episodes + sum_rewards = 0.0 + for _ in range(100): + states = environment.reset() + + # Required in case of internal states: + # internals = agent.initial_internals() + # internals = recursive_map(batch, internals) + + terminal = False + while not terminal: + + states = batch(states) + # Required in case of nested states: + # states = recursive_map(batch, states) + + auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool)) + deterministic = True + + actions = agent.act(states, auxiliaries, deterministic) + # Required in case of internal states: + # actions_internals = agent.act(states, internals, auxiliaries, deterministic) + # actions, internals = actions_internals['actions'], actions_internals['internals'] + + actions = unbatch(actions) + # Required in case of nested actions: + # actions = recursive_map(unbatch, actions) + + states, terminal, reward = environment.execute(actions=actions) + sum_rewards += reward + + print('Mean evaluation return:', sum_rewards / 100.0) + environment.close() + + +# Batch inputs +def batch(x): + return np.expand_dims(x, axis=0) + + +# Unbatch outputs +def unbatch(x): + if isinstance(x, tf.Tensor): # TF tensor to NumPy array + x = x.numpy() + if x.shape == (1,): # Singleton array to Python value + return x.item() + else: + return np.squeeze(x, axis=0) + + +# Apply function to leaf values in nested dict +# (required for nested states/actions) +def recursive_map(function, dictionary): + mapped = dict() + for key, value in dictionary.items(): + if isinstance(value, dict): + mapped[key] = recursive_map(function, value) + else: + mapped[key] = function(value) + return mapped + + +if __name__ == '__main__': + main() diff --git 
a/examples/lab_main.py b/examples/lab_main.py deleted file mode 100755 index 7c224ee2f..000000000 --- a/examples/lab_main.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -Deepmind lab execution -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import logging -import os -import sys -import time -import json - -import numpy as np - -from tensorforce import TensorForceError -from tensorforce.agents import Agent - -# This was necessary for bazel, test if can be removed -logger = logging.getLogger(__name__) - -from tensorforce.contrib.deepmind_lab import DeepMindLab -from tensorforce.execution import Runner - - -def main(): - parser = argparse.ArgumentParser() - - # N.b. if ran from within lab, the working directory is something like lab/bazel-out/../../tensorforce - # Hence, relative paths will not work without first fetching the path of this run file - parser.add_argument('-id', '--level-id', default='tests/demo_map',help="DeepMind Lab level id") - parser.add_argument('-a', '--agent-config', help="Agent configuration file") - parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") - parser.add_argument('-e', '--episodes', type=int, default=1000, help="Number of episodes") - parser.add_argument('-t', '--max-timesteps', type=int, default=200, help="Maximum number of timesteps per episode") - parser.add_argument('-m', '--monitor', help="Save results to this directory") - parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") - parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") - parser.add_argument('-s', '--save', help="Save agent to this dir") - parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") - parser.add_argument('-l', '--load', help="Load agent from this dir") - parser.add_argument('-D', '--debug', action='store_true', default=True, help="Show debug outputs") - - # Redirect output to file - sys.stdout = open('lab_output.txt', 'w') - - args = parser.parse_args() - - environment = DeepMindLab(args.level_id) - - path = os.path.dirname(__file__) - if args.agent_config: - # Use absolute path - agent_config = json.load(path + args.agent_config) - else: - raise TensorForceError("No agent configuration provided.") - - if not args.network_spec: - raise TensorForceError("No network configuration provided.") - else: - network_spec = json.load(path + args.network_config) - - logger = logging.getLogger(__name__) - logger.setLevel(logging.INFO) # configurable!!! 
- - agent = Agent.from_spec( - spec=agent_config, - kwargs=dict( - states_spec=environment.states, - actions_spec=environment.actions, - network_spec=network_spec - ) - ) - if args.load: - load_dir = os.path.dirname(args.load) - if not os.path.isdir(load_dir): - raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) - agent.restore_model(args.load) - - if args.debug: - logger.info("-" * 16) - logger.info("Configuration:") - logger.info(agent_config) - - runner = Runner( - agent=agent, - environment=environment, - repeat_actions=1 - ) - if args.load: - load_dir = os.path.dirname(args.load) - if not os.path.isdir(load_dir): - raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) - agent.restore_model(args.load) - - if args.debug: - logger.info("-" * 16) - logger.info("Configuration:") - logger.info(agent_config) - - if args.save: - save_dir = os.path.dirname(args.save) - if not os.path.isdir(save_dir): - try: - os.mkdir(save_dir, 0o755) - except OSError: - raise OSError("Cannot save agent to dir {} ()".format(save_dir)) - - report_episodes = args.episodes // 1000 - - def episode_finished(r): - if r.episode % report_episodes == 0: - sps = r.total_timesteps / (time.time() - r.start_time) - logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps)) - logger.info("Episode reward: {}".format(r.episode_rewards[-1])) - logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:]))) - logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:]))) - return True - - logger.info("Starting {agent} for Lab environment '{env}'".format(agent=agent, env=environment)) - runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) - runner.close() - logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1)) - - environment.close() - - -if __name__ == '__main__': - main() diff --git a/examples/maze_explorer.py b/examples/maze_explorer.py deleted file mode 100644 index 151131614..000000000 --- a/examples/maze_explorer.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -""" -Maze Explorer execution -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import logging -import os -import time - -from tensorforce import TensorForceError -import json - -from tensorforce.agents import Agent -from tensorforce.execution import Runner -from tensorforce.contrib.maze_explorer import MazeExplorer - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument('--mode', help="ID of the game mode") - parser.add_argument('--hide', dest='hide', action='store_const', const=True, default=False, help="Hide output window") - parser.add_argument('-a', '--agent-config', help="Agent configuration file") - parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") - parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") - parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") - parser.add_argument('-s', '--save', help="Save agent to this dir") - parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") - parser.add_argument('-l', '--load', help="Load agent from this dir") - parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") - - args = parser.parse_args() - - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) # configurable!!! - - environment = MazeExplorer(mode_id=args.mode, visible=not args.hide) - - if args.agent_config is not None: - with open(args.agent_config, 'r') as fp: - agent_config = json.load(fp=fp) - else: - raise TensorForceError("No agent configuration provided.") - - if args.network_spec is not None: - with open(args.network_spec, 'r') as fp: - network_spec = json.load(fp=fp) - else: - network_spec = None - logger.info("No network configuration provided.") - - agent = Agent.from_spec( - spec=agent_config, - kwargs=dict( - states_spec=environment.states, - actions_spec=environment.actions, - network_spec=network_spec - ) - ) - - if args.load: - load_dir = os.path.dirname(args.load) - if not os.path.isdir(load_dir): - raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) - agent.restore_model(args.load) - - if args.debug: - logger.info("-" * 16) - logger.info("Configuration:") - logger.info(agent_config) - - if args.save: - save_dir = os.path.dirname(args.save) - if not os.path.isdir(save_dir): - try: - os.mkdir(save_dir, 0o755) - except OSError: - raise OSError("Cannot save agent to dir {} ()".format(save_dir)) - - runner = Runner( - agent=agent, - environment=environment, - repeat_actions=1 - ) - - report_episodes = args.episodes // 1000 - if args.debug: - report_episodes = 1 - - def episode_finished(r): - if r.episode % report_episodes == 0: - sps = r.timestep / (time.time() - r.start_time) - logger.info("Finished episode {ep} after {ts} timesteps. 
Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps)) - logger.info("Episode reward: {}".format(r.episode_rewards[-1])) - logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500)) - logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) - return True - - logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) - runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) - runner.close() - logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode)) - - environment.close() - - -if __name__ == '__main__': - main() diff --git a/examples/multiactor_environment.py b/examples/multiactor_environment.py new file mode 100644 index 000000000..466c73078 --- /dev/null +++ b/examples/multiactor_environment.py @@ -0,0 +1,87 @@ +# Copyright 2021 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np + +from tensorforce import Environment, Runner + + +class MultiactorEnvironment(Environment): + """ + Example multi-actor environment, illustrating best-practice implementation pattern. + + State space: position in [0, 10]. + Action space: movement in {-1, 0, 1}. + Random start in [3, 7]. + Actor 1 perspective as is, actor 2 perspective mirrored. + Positive reward for being closer to 10. 
+ """ + + def __init__(self): + super().__init__() + + def states(self): + return dict(type='int', num_values=11) + + def actions(self): + return dict(type='int', num_values=3) + + def num_actors(self): + return 2 # Indicates that environment has multiple actors + + def reset(self): + # Always for multi-actor environments: initialize parallel indices + self._parallel_indices = np.arange(self.num_actors()) + + # Single shared environment logic, plus per-actor perspective + self._states = 3 + np.random.randint(5) + self.second_actor = True + states = np.stack([self._states, 10 - self._states], axis=0) + + # Always for multi-actor environments: return per-actor values + return self._parallel_indices.copy(), states + + def execute(self, actions): + # Single shared environment logic, plus per-actor perspective + if self.second_actor: + self.second_actor = self.second_actor and not (np.random.random_sample() < 0.1) + terminal = np.stack([False, not self.second_actor], axis=0) + delta = (actions[0] - 1) - (actions[1] - 1) + self._states = np.clip(self._states + delta, a_min=0, a_max=10) + states = np.stack([self._states, 10 - self._states], axis=0) + else: + terminal = np.stack([False], axis=0) + delta = (actions[0] - 1) + self._states = np.clip(self._states + delta, a_min=0, a_max=10) + states = np.stack([self._states], axis=0) + reward = (states - 5.0) / 5.0 + + # Always for multi-actor environments: update parallel indices, and return per-actor values + self._parallel_indices = self._parallel_indices[~terminal] + return self._parallel_indices.copy(), states, terminal, reward + + +def main(): + # Multi-actor runner, automatically if environment.num_actors() > 1 + runner = Runner( + agent='benchmarks/configs/ppo.json', + environment=MultiactorEnvironment, + max_episode_timesteps=10 + ) + runner.run(num_episodes=1000) + + +if __name__ == '__main__': + main() diff --git a/examples/openai_gym.py b/examples/openai_gym.py deleted file mode 100644 index cc542b4a2..000000000 --- a/examples/openai_gym.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -OpenAI gym execution. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import json -import logging -import os -import time - -from tensorforce import TensorForceError -from tensorforce.agents import Agent -from tensorforce.execution import Runner -from tensorforce.contrib.openai_gym import OpenAIGym - - -# python examples/openai_gym.py Pong-ram-v0 -a examples/configs/vpg.json -n examples/configs/mlp2_network.json -e 50000 -m 2000 - -# python examples/openai_gym.py CartPole-v0 -a examples/configs/vpg.json -n examples/configs/mlp2_network.json -e 2000 -m 200 - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument('gym_id', help="Id of the Gym environment") - parser.add_argument('-a', '--agent', help="Agent configuration file") - parser.add_argument('-n', '--network', default=None, help="Network specification file") - parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") - parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") - parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") - parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically") - parser.add_argument('-l', '--load', help="Load agent from this dir") - parser.add_argument('--monitor', help="Save results to this directory") - parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") - parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") - parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") - - args = parser.parse_args() - - logging.basicConfig(level=logging.INFO) - - logger = logging.getLogger(__file__) - logger.setLevel(logging.INFO) - - environment = OpenAIGym( - gym_id=args.gym_id, - monitor=args.monitor, - monitor_safe=args.monitor_safe, - monitor_video=args.monitor_video - ) - - if args.agent is not None: - with open(args.agent, 'r') as fp: - agent = json.load(fp=fp) - else: - raise TensorForceError("No agent configuration provided.") - - if args.network is not None: - with open(args.network, 'r') as fp: - network = json.load(fp=fp) - else: - network = None - logger.info("No network configuration provided.") - - agent = Agent.from_spec( - spec=agent, - kwargs=dict( - states=environment.states, - actions=environment.actions, - network=network - ) - ) - if args.load: - load_dir = os.path.dirname(args.load) - if not os.path.isdir(load_dir): - raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) - agent.restore_model(args.load) - - if args.debug: - logger.info("-" * 16) - logger.info("Configuration:") - logger.info(agent) - - runner = Runner( - agent=agent, - environment=environment, - repeat_actions=1 - ) - - if args.debug: # TODO: Timestep-based reporting - report_episodes = 1 - else: - report_episodes = 100 - - logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) - - def episode_finished(r): - if r.episode % report_episodes == 0: - steps_per_second = r.timestep / (time.time() - r.start_time) - logger.info("Finished episode {:d} after {:d} timesteps. 
Steps Per Second {:0.2f}".format( - r.agent.episode, r.episode_timestep, steps_per_second - )) - logger.info("Episode reward: {}".format(r.episode_rewards[-1])) - logger.info("Average of last 500 rewards: {:0.2f}".format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) - logger.info("Average of last 100 rewards: {:0.2f}".format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) - return True - - runner.run( - timesteps=args.timesteps, - episodes=args.episodes, - max_episode_timesteps=args.max_episode_timesteps, - deterministic=args.deterministic, - episode_finished=episode_finished - ) - runner.close() - - logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode)) - - -if __name__ == '__main__': - main() diff --git a/examples/openai_gym_async.py b/examples/openai_gym_async.py deleted file mode 100644 index 21f554472..000000000 --- a/examples/openai_gym_async.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -OpenAI gym execution - -To run this script with 3 workers on Pong-ram-v0: -$ python examples/openai_gym_async.py Pong-ram-v0 -a examples/configs/vpg.json -n examples/configs/mlp2_network.json -e 50000 -m 2000 -W 3 - -Or on CartPole-v0: -$ python examples/openai_gym_async.py CartPole-v0 -a examples/configs/vpg.json -n examples/configs/mlp2_network.json -e 10000 -m 200 -W 3 - -You can check what the workers are doing: -$ tmux a -t OpenAI # `ctrl+b d` to exit tmux - -To kill the session: -$ python examples/openai_gym_async.py CartPole-v0 -W 3 -K -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import inspect -import json -import logging -import os -import sys -import time - -import tensorflow as tf -from six.moves import xrange, shlex_quote - -from tensorforce import TensorForceError -from tensorforce.agents import Agent -from tensorforce.execution import Runner -from tensorforce.contrib.openai_gym import OpenAIGym - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument('gym_id', help="Id of the Gym environment") - parser.add_argument('-a', '--agent', help="Agent configuration file") - parser.add_argument('-n', '--network', default=None, help="Network specification file") - parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") - parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") - parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") - parser.add_argument('-d', '--deterministic', action='store_true', help="Choose actions deterministically") - parser.add_argument('-M', '--mode', choices=('tmux', 'child'), default='tmux', help="Starter mode") - parser.add_argument('-W', '--num-workers', type=int, 
default=1, help="Number of worker agents") - parser.add_argument('-C', '--child', action='store_true', help="Child process") - parser.add_argument('-P', '--parameter-server', action='store_true', help="Parameter server") - parser.add_argument('-I', '--task-index', type=int, default=0, help="Task index") - parser.add_argument('-K', '--kill', action='store_true', help="Kill runners") - parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory") - parser.add_argument('-D', '--debug', action='store_true', help="Show debug outputs") - - args = parser.parse_args() - - session_name = 'OpenAI-' + args.gym_id - shell = '/bin/bash' - - kill_cmds = [ - "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers), - "tmux kill-session -t {}".format(session_name), - ] - if args.kill: - os.system("\n".join(kill_cmds)) - return 0 - - if not args.child: - # start up child processes - target_script = os.path.abspath(inspect.stack()[0][1]) - - def wrap_cmd(session, name, cmd): - if isinstance(cmd, list): - cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd) - if args.mode == 'tmux': - return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd)) - elif args.mode == 'child': - return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format( - cmd, args.logdir, session, name, args.logdir - ) - - def build_cmd(ps, index): - cmd_args = [ - 'CUDA_VISIBLE_DEVICES=', - sys.executable, target_script, - args.gym_id, - '--agent', os.path.join(os.getcwd(), args.agent), - '--network', os.path.join(os.getcwd(), args.network), - '--num-workers', args.num_workers, - '--child', - '--task-index', index - ] - if args.episodes is not None: - cmd_args.append('--episodes') - cmd_args.append(args.episodes) - if args.timesteps is not None: - cmd_args.append('--timesteps') - cmd_args.append(args.timesteps) - if args.max_episode_timesteps is not None: - cmd_args.append('--max-episode-timesteps') - cmd_args.append(args.max_episode_timesteps) - if args.deterministic: - cmd_args.append('--deterministic') - if ps: - cmd_args.append('--parameter-server') - if args.debug: - cmd_args.append('--debug') - return cmd_args - - if args.mode == 'tmux': - cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)] - elif args.mode == 'child': - cmds = ['mkdir -p {}'.format(args.logdir), - 'rm -f {}/kill.sh'.format(args.logdir), - 'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir), - 'chmod +x {}/kill.sh'.format(args.logdir)] - - cmds.append(wrap_cmd(session_name, 'ps', build_cmd(ps=True, index=0))) - - for i in xrange(args.num_workers): - name = 'worker{}'.format(i) - if args.mode == 'tmux': - cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell)) - cmds.append(wrap_cmd(session_name, name, build_cmd(ps=False, index=i))) - - # add one PS call - # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell)) - - print("\n".join(cmds)) - - os.system("\n".join(cmds)) - - return 0 - - ps_hosts = ['127.0.0.1:{}'.format(12222)] - worker_hosts = [] - port = 12223 - for _ in range(args.num_workers): - worker_hosts.append('127.0.0.1:{}'.format(port)) - port += 1 - cluster = {'ps': ps_hosts, 'worker': worker_hosts} - cluster_spec = tf.train.ClusterSpec(cluster) - - environment = OpenAIGym(args.gym_id) - - logger = logging.getLogger(__name__) - logger.setLevel(logging.INFO) # log_levels[agent.log_level]) - - if args.agent is not None: - with open(args.agent, 'r') as fp: - agent = json.load(fp=fp) - else: - raise 
TensorForceError("No agent configuration provided.") - - if args.network is not None: - with open(args.network, 'r') as fp: - network = json.load(fp=fp) - else: - network = None - logger.info("No network configuration provided.") - - if args.parameter_server: - agent['device'] = '/job:ps/task:{}'.format(args.task_index) # '/cpu:0' - else: - agent['device'] = '/job:worker/task:{}'.format(args.task_index) # '/cpu:0' - - agent['distributed'] = dict( - cluster_spec=cluster_spec, - task_index=args.task_index, - parameter_server=args.parameter_server, - protocol='grpc' - ) - - agent = Agent.from_spec( - spec=agent, - kwargs=dict( - states=environment.states, - actions=environment.actions, - network=network - ) - ) - - logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id)) - logger.info("Config:") - logger.info(agent) - - runner = Runner( - agent=agent, - environment=environment, - repeat_actions=1 - ) - - if args.debug: # TODO: Timestep-based reporting - report_episodes = 1 - else: - report_episodes = 100 - - def episode_finished(r): - if r.episode % report_episodes == 0: - steps_per_second = r.timestep / (time.time() - r.start_time) - logger.info("Finished episode {} after overall {} timesteps. Steps Per Second {}".format( - r.agent.episode, - r.agent.timestep, - steps_per_second) - ) - logger.info("Episode reward: {}".format(r.episode_rewards[-1])) - logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) - logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) - return True - - runner.run( - timesteps=args.timesteps, - episodes=args.episodes, - max_episode_timesteps=args.max_episode_timesteps, - deterministic=args.deterministic, - episode_finished=episode_finished - ) - runner.close() - - -if __name__ == '__main__': - main() diff --git a/examples/openai_universe.py b/examples/openai_universe.py deleted file mode 100644 index 8903a1563..000000000 --- a/examples/openai_universe.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -OpenAI universe example. - -In order to use openai universe, please make sure you have docker installed. 
- -Then use like this: - -``python examples/openai_universe.py -a DQNAgent -c examples/configs/dqn_agent.json -n examples/configs/dqn_network.json flashgames.DuskDrive-v0`` - -This will create a docker session that you can connect to by visiting - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import logging -import os -import time - -from tensorforce import TensorForceError -from tensorforce.agents import Agent -import json -from tensorforce.execution import Runner -from tensorforce.contrib.openai_universe import OpenAIUniverse - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument('gym_id', help="ID of the gym environment") - parser.add_argument('-a', '--agent-config', help="Agent configuration file") - parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") - parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") - parser.add_argument('-t', '--max-timesteps', type=int, default=2000*60, help="Maximum number of timesteps per episode") - # parser.add_argument('-m', '--monitor', help="Save results to this directory") - # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") - # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") - parser.add_argument('-s', '--save', help="Save agent to this dir") - parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") - parser.add_argument('-l', '--load', help="Load agent from this dir") - parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") - - args = parser.parse_args() - - logger = logging.getLogger(__name__) - logger.setLevel(logging.INFO) - environment = OpenAIUniverse(args.gym_id) - environment.configure(remotes=1) - - if args.agent_config is not None: - with open(args.agent_config, 'r') as fp: - agent_config = json.load(fp=fp) - else: - raise TensorForceError("No agent configuration provided.") - - if args.network_spec: - with open(args.network_spec, 'r') as fp: - network_spec = json.load(fp=fp) - else: - network_spec = None - logger.info("No network configuration provided.") - - agent = Agent.from_spec( - spec=agent_config, - kwargs=dict( - states_spec=environment.states, - actions_spec=environment.actions, - network_spec=network_spec - ) - ) - - if args.load: - load_dir = os.path.dirname(args.load) - if not os.path.isdir(load_dir): - raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) - agent.load_model(args.load) - - if args.debug: - logger.info("-" * 16) - logger.info("Configuration:") - logger.info(agent_config) - - runner = Runner( - agent=agent, - environment=environment, - repeat_actions=1 - ) - - report_episodes = args.episodes // 1000 - if args.debug: - report_episodes = 1 - def episode_finished(r): - if r.episode % report_episodes == 0: - steps_per_second = r.timestep / (time.time() - r.start_time) - logger.info("Finished episode {} after {} timesteps. 
Steps Per Second {}".format( - r.episode, r.episode_timestep, steps_per_second - )) - logger.info("Episode reward: {}".format(r.episode_rewards[-1])) - logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500)) - logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) - return True - - logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) - runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) - runner.close() - logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode)) - - # if args.monitor: - # environment.gym.monitor.close() - environment.close() - - -if __name__ == '__main__': - main() diff --git a/examples/parallelization.py b/examples/parallelization.py new file mode 100644 index 000000000..130679cf2 --- /dev/null +++ b/examples/parallelization.py @@ -0,0 +1,109 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from threading import Thread + +from tensorforce import Environment, Runner + + +def main(): + local() + local_vectorized() + multiprocessing() + socket() + + +def local(): + """ + Train agent on experience collected in parallel from 4 local CartPole environments. + + Typical use case: + time for batched agent.act() ~ time for agent.act() > time for environment.execute() + """ + agent = 'benchmarks/configs/ppo.json' + environment = 'benchmarks/configs/cartpole.json' + runner = Runner(agent=agent, environment=environment, num_parallel=4) + # Batch act/observe calls to agent, unless environment.is_vectorizable() + # (otherwise essentially equivalent to single environment) + runner.run(num_episodes=100, batch_agent_calls=True) + runner.close() + + +def local_vectorized(): + """ + Train agent on experience collected in parallel from one vectorized CartPole environment. + + Typical use case: + time for vectorized environment < time for sequential execution + """ + agent = 'benchmarks/configs/ppo.json' + environment = 'custom_cartpole' + runner = Runner(agent=agent, environment=environment, max_episode_timesteps=500, num_parallel=4) + runner.run(num_episodes=100) + runner.close() + + +def multiprocessing(): + """ + Train agent on experience collected in parallel from 4 CartPole environments running in + separate processes. 
+ + Typical use case: + (a) time for batched agent.act() ~ time for agent.act() + > time for environment.execute() + remote communication + --> batch_agent_calls = True + (b) time for environment.execute() > time for agent.act() + process communication + --> batch_agent_calls = False + """ + agent = 'benchmarks/configs/ppo.json' + environment = 'benchmarks/configs/cartpole.json' + runner = Runner(agent=agent, environment=environment, num_parallel=4, remote='multiprocessing') + runner.run(num_episodes=100, batch_agent_calls=True) # optional: batch_agent_calls=True + runner.close() + + +def socket(): + """ + Train agent on experience collected in parallel from 2 CartPole environments running on + another machine. + + Typical use case: same as mode 2, but generally remote communication socket > process + + Simulate remote environment, usually run on another machine via: + python run.py --environment gym --level CartPole-v1 --remote socket-server --port 65432 + """ + agent = 'benchmarks/configs/ppo.json' + environment = 'benchmarks/configs/cartpole.json' + + def server(port): + Environment.create(environment=environment, remote='socket-server', port=port) + + server1 = Thread(target=server, kwargs=dict(port=65432)) + server2 = Thread(target=server, kwargs=dict(port=65433)) + server1.start() + server2.start() + + runner = Runner( + agent=agent, num_parallel=2, remote='socket-client', host='127.0.0.1', port=65432 + ) + runner.run(num_episodes=100) # optional: batch_agent_calls=True + runner.close() + + server1.join() + server2.join() + + +if __name__ == '__main__': + main() diff --git a/examples/quickstart.py b/examples/quickstart.py index 4f4d63a5d..eb4768533 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,96 +13,56 @@ # limitations under the License. # ============================================================================== -import numpy as np +from tensorforce import Runner -from tensorforce.agents import PPOAgent -from tensorforce.execution import Runner -from tensorforce.contrib.openai_gym import OpenAIGym -# Create an OpenAIgym environment. -environment = OpenAIGym('CartPole-v0', visualize=False) +def main(): + # OpenAI-Gym environment specification + environment = dict(environment='gym', level='CartPole-v1') + # or: environment = Environment.create( + # environment='gym', level='CartPole-v1', max_episode_timesteps=500) -# Network as list of layers -# - Embedding layer: -# - For Gym environments utilizing a discrete observation space, an -# "embedding" layer should be inserted at the head of the network spec. -# Such environments are usually identified by either: -# - class ...Env(discrete.DiscreteEnv): -# - self.observation_space = spaces.Discrete(...) 
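A side note before the new quickstart agent specification below: network='auto' can be replaced by an explicit layer list in the style of the removed network above. This is a sketch only; the dense-layer dicts mirror the removed quickstart network, and for discrete observation spaces the removed comment above suggests prepending an embedding layer:

```python
# Hand-specified network as an alternative to network='auto' in the PPO spec below;
# the layer dicts mirror the removed quickstart network above.
network = [
    dict(type='dense', size=32),
    dict(type='dense', size=32)
]
agent = dict(agent='ppo', network=network, batch_size=10)
```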
+ # PPO agent specification + agent = dict( + agent='ppo', + # Automatically configured network + network='auto', + # PPO optimization parameters + batch_size=10, update_frequency=2, learning_rate=3e-4, multi_step=10, + subsampling_fraction=0.33, + # Reward estimation + likelihood_ratio_clipping=0.2, discount=0.99, predict_terminal_values=False, + reward_processing=None, + # Baseline network and optimizer + baseline=dict(type='auto', size=32, depth=1), + baseline_optimizer=dict(optimizer='adam', learning_rate=1e-3, multi_step=10), + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Default additional config values + config=None, + # Save agent every 10 updates and keep the 5 most recent checkpoints + saver=dict(directory='model', frequency=10, max_checkpoints=5), + # Log all available Tensorboard summaries + summarizer=dict(directory='summaries', summaries='all'), + # Do not record agent-environment interaction trace + recorder=None + ) + # or: Agent.create(agent='ppo', environment=environment, ...) + # with additional argument "environment" and, if applicable, "parallel_interactions" -network_spec = [ - # dict(type='embedding', indices=100, size=32), - dict(type='dense', size=32), - dict(type='dense', size=32) -] + # Initialize the runner + runner = Runner(agent=agent, environment=environment, max_episode_timesteps=500) -agent = PPOAgent( - states=environment.states, - actions=environment.actions, - network=network_spec, - # Agent - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - # MemoryModel - update_mode=dict( - unit='episodes', - # 10 episodes per update - batch_size=20, - # Every 10 episodes - frequency=20 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=5000 - ), - # DistributionModel - distributions=None, - entropy_regularization=0.01, - # PGModel - baseline_mode='states', - baseline=dict( - type='mlp', - sizes=[32, 32] - ), - baseline_optimizer=dict( - type='multi_step', - optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - num_steps=5 - ), - gae_lambda=0.97, - # PGLRModel - likelihood_ratio_clipping=0.2, - # PPOAgent - step_optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - subsampling_fraction=0.2, - optimization_steps=25 -) + # Train for 200 episodes + runner.run(num_episodes=200) + runner.close() -# Create the runner -runner = Runner(agent=agent, environment=environment) + # plus agent.close() and environment.close() if created separately -# Callback function printing episode statistics -def episode_finished(r): - print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.episode_timestep, - reward=r.episode_rewards[-1])) - return True - - -# Start learning -runner.run(episodes=3000, max_episode_timesteps=200, episode_finished=episode_finished) -runner.close() - -# Print statistics -print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format( - ep=runner.episode, - ar=np.mean(runner.episode_rewards[-100:])) -) +if __name__ == '__main__': + main() diff --git a/examples/record_and_pretrain.py b/examples/record_and_pretrain.py new file mode 100644 index 000000000..32d37096c --- /dev/null +++ b/examples/record_and_pretrain.py @@ -0,0 +1,112 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. 
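Ahead of the new examples/record_and_pretrain.py introduced below: the recorded traces are plain .npz files containing states/actions/terminal/reward arrays, so they can be inspected directly. A small, hypothetical sketch (directory and file-name pattern taken from the recording functions below; actual names depend on the recorder configuration used):

```python
import numpy as np

# Inspect one recorded trace written by the example below
trace = np.load('ppo-traces/trace-000000000.npz')
print({name: trace[name].shape for name in trace.files})
```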
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os + +import numpy as np + +from tensorforce import Agent, Environment, Runner + + +def main(): + # Record experience traces + record_ppo_config(directory='ppo-traces') + # Alternatively: + # record_custom_act_function(directory='ppo-traces') + # write_custom_recording_file(directory='ppo-traces') + + # Pretrain a new agent on the recorded traces: for 30 iterations, feed the + # experience of one episode to the agent and subsequently perform one update + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + agent.pretrain(directory='ppo-traces', num_iterations=30, num_traces=1, num_updates=1) + + # Evaluate the pretrained agent + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=100, evaluation=True) + runner.close() + + # Close agent and environment + agent.close() + environment.close() + + +def record_ppo_config(directory): + # Start recording traces after 80 episodes -- by then, the environment is solved + runner = Runner( + agent=dict( + agent='benchmarks/configs/ppo.json', + recorder=dict(directory=directory, start=80) + ), environment='benchmarks/configs/cartpole.json' + ) + runner.run(num_episodes=100) + runner.close() + + +def record_custom_act_function(directory): + # Trivial custom act function + def fn_act(states): + return int(states[2] < 0.0) + + # Record 20 episodes + runner = Runner( + agent=dict(agent=fn_act, recorder=dict(directory=directory)), + environment='benchmarks/configs/cartpole.json' + ) + # or: agent = Agent.create(agent=fn_act, recorder=dict(directory=directory)) + runner.run(num_episodes=20) + runner.close() + + +def write_custom_recording_file(directory): + # Start recording traces after 80 episodes -- by then, the environment is solved + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=80) + runner.close() + + # Record 20 episodes + for episode in range(20): + + # Record episode experience + episode_states = list() + episode_actions = list() + episode_terminal = list() + episode_reward = list() + + # Evaluation episode + states = environment.reset() + terminal = False + while not terminal: + episode_states.append(states) + actions = agent.act(states=states, independent=True, deterministic=True) + episode_actions.append(actions) + states, terminal, reward = environment.execute(actions=actions) + episode_terminal.append(terminal) + episode_reward.append(reward) + + # Write recorded episode trace to npz file + np.savez_compressed( + file=os.path.join(directory, 'trace-{:09d}.npz'.format(episode)), + states=np.stack(episode_states, axis=0), + actions=np.stack(episode_actions, axis=0), + 
terminal=np.stack(episode_terminal, axis=0), + reward=np.stack(episode_reward, axis=0) + ) + + +if __name__ == '__main__': + main() diff --git a/examples/save_load_agent.py b/examples/save_load_agent.py new file mode 100644 index 000000000..48bb4b029 --- /dev/null +++ b/examples/save_load_agent.py @@ -0,0 +1,65 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce import Agent, Environment, Runner + + +def main(): + # OpenAI-Gym environment initialization + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + + # PPO agent initialization + agent = Agent.create( + agent='benchmarks/configs/ppo.json', environment=environment, + # Option 1: Saver - save agent periodically every 10 updates + # and keep the 5 most recent checkpoints + saver=dict(directory='model-checkpoint', frequency=10, max_checkpoints=5), + ) + + # Runner initialization + runner = Runner(agent=agent, environment=environment) + + # Training + runner.run(num_episodes=100) + runner.close() + + # Option 2: Explicit save + # (format: 'numpy' or 'hdf5' store only weights, 'checkpoint' stores full TensorFlow model, + # agent argument saver, specified above, uses 'checkpoint') + agent.save(directory='model-numpy', format='numpy', append='episodes') + + # Close agent separately, since created separately + agent.close() + + # Load agent TensorFlow checkpoint + agent = Agent.load(directory='model-checkpoint', format='checkpoint', environment=environment) + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=100, evaluation=True) + runner.close() + agent.close() + + # Load agent NumPy weights + agent = Agent.load(directory='model-numpy', format='numpy', environment=environment) + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=100, evaluation=True) + runner.close() + agent.close() + + # Close environment separately, since created separately + environment.close() + + +if __name__ == '__main__': + main() diff --git a/examples/temperature-controller.ipynb b/examples/temperature-controller.ipynb new file mode 100755 index 000000000..5c74364cc --- /dev/null +++ b/examples/temperature-controller.ipynb @@ -0,0 +1,479 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Environment and RL Agent Controller for a Thermostat\n", + "\n", + "```\n", + "Author: Matt Pettis\n", + "Github: mpettis\n", + "Twitter: @mtpettis\n", + "Date: 2020-04-27\n", + "```\n", + "\n", + "This is a toy example of a room with a heater. When the heater is off, the temperature will decay to 0.0, and when it is on, it will rise to 1.0. 
The decay and rise is not instantaneous, but has exponential decay behavior in time given by the following formula:\n", + "\n", + "    temperature[i + 1] = heater[i] + (temperature[i] - heater[i]) * exp(-1/tau)\n", + "\n", + "Where:\n", + "\n", + "    temperature[i] is the temperature at timestep i (between 0 and 1).\n", + "    heater[i] is the applied heater, 0 when not applied, 1 when applied.\n", + "    tau is the characteristic heat decay constant.\n", + "\n", + "So, when the heater is off, the temperature will decay towards 0, and when the heater is on, it will rise towards 1. When the heater is toggled on/off, it will drift towards 1/0.\n", + "\n", + "Here is a sample plot of what the temperature response looks like when the heater is on for a while, then off for a while. You will see the characteristic rise and decay of the temperature to the response." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "<base64 PNG data omitted: line plot of the heater action and the resulting exponential temperature response over 20 timesteps>", + "text/plain": [ + "
          " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import math\n", + "\n", + "## Compute the response for a given action and current temperature\n", + "def respond(action, current_temp, tau):\n", + " return action + (current_temp - action) * math.exp(-1.0/tau)\n", + "\n", + "## Actions of a series of on, then off\n", + "sAction = pd.Series(np.array([1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0]))\n", + "sResponse = np.zeros(sAction.size)\n", + "\n", + "## Update the response with the response to the action\n", + "for i in range(sAction.size):\n", + " ## Get last response\n", + " if i == 0:\n", + " last_response = 0\n", + " else:\n", + " last_response = sResponse[i - 1]\n", + " sResponse[i] = respond(sAction[i], last_response, 3.0)\n", + "\n", + "## Assemble and plot\n", + "df = pd.DataFrame(list(zip(sAction, sResponse)), columns=['action', 'response'])\n", + "df.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Goal and Reward\n", + "The goal here is to make an agent that will take actions that will keep the temperature between 0.4 and 0.6.\n", + "\n", + "We make a reward function to reflect our goal. When the temperature is between 0.4 and 0.6, we set the reward as 0.0. When the temperature is outside of this band, we set the reward to be the negative distance the temperature is from its closest band. So if the temperature is 0.1, then the reward is -(0.4 - 0.1) = -0.3, and if it is 0.8, then the reward is -(0.8 - 0.6) = -0.2.\n", + "\n", + "Let's chart the reward vs. temperature to show what is meant:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Reward vs. 
Temperature')" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAt8AAAEWCAYAAAC+BfslAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAr1ElEQVR4nO3debhkdX3v+/dHBu04NUMHobFFIzZxCoQW5RgjXtE2HiNoDGocWiMSYozRHPsIepJwk3jFSwwZzDGiVxGNijEE+jikH8ZojKiNoKCxA0EJNA20YEfFFhm+949aG6s3VXvX3r1rfr+ep56uWuu3av32rqq1vr3271O/VBWSJEmS+u9+w+6AJEmSNC0sviVJkqQBsfiWJEmSBsTiW5IkSRoQi29JkiRpQCy+JUmSpAGx+JakEZfkVUn+Zdj9kCTtOotvSVMtyXeS7EjywyQ3JTkzyYOG3a9hSbKq+V3M3CrJ7W2PnzbsPi5G83M8etj9kCSLb0mCX62qBwGHAocBJw+rI0l2H9a+AarqP6vqQTO3ZvEvtC37/DD718kgfmdJduv3PiRNB4tvSWpU1U3ARlpFOABJnpLkX5NsT/K1JEc1y5+R5Mq2ducn+Urb488nOba5f1KS/0jygyTfTPKCtnavSvKFJKcnuRU4Jck+STYk+X6SLwM/163PST6b5PWzln0tyQvTcnqSW5rnujLJ4xf7+0ly/yR/luQ/k9yc5G+TLGvWHZXkhiT/s9nf1iTHJnlukn9PcluSt7Y91ylJPpnk7Ob38tUkv9C2/oAk/5BkW5JvJ3lDh20/kuT7wKuSHJHki83rtDXJu5Ps2bT/XLPp15qr9y/uNJSn/ep48xeQ9yT5TJLbgWfM1SdJ6pXFtyQ1khwI/ApwTfN4JfBp4E+BvYE3A/+QZAVwKXBwkn2T7AE8ETggyYObgnQNMHOV+D+ApwEPBf5v4CNJ9m/b9ZOBa4H9gLcDfwP8GNgf+M3m1s3HgJe2/QyPBR7R9PvZwC8Dj2n2fRxw64J/MT91avNchwKPBlYCf9i2/mHAA9qWvw94OXA4rZ//D5I8sq39McDf0/rdfhQ4N8keSe4H/B/ga81zPRN4Y5K1s7b9JLAc+DvgbuBNwL7Akc02rwOoql9utpm5gn92jz/vb9B6PR4M/GsPfZKkeVl8S1Kr6PsBcD1wC/BHzfKXA5+pqs9U1T1VdT6wCXhuVe0AvkKruD2cVlH2BeCpwFOAq6vqVoCq+vuqurF5jrOBq4Ej2vZ/Y1X9dVXdBfwE+DXgD6vq9qq6CvjQHH3/R+DQJI9oHr8MOKeq7gDupFU4HgKkqv6tqrYu5heUJMAJwJuq6raq+gHw/wAvaWt2J/D2qroT+DitQvgvq+oHVfUN4JvAL7S1v6yqPtm0/3NahftTgCcBK6rqj6vqJ1V1La1Cvn1fX6yqc5vf6Y6quqyqLq2qu6rqO8B7gacv5mdtc15VfaGq7gGe0EOfJGleQx1bKEkj4tiquiDJ02ldgd0X2E7rCvKvJ/nVtrZ7ABc39/8ZOAq4obn/PVoF3x3NYwCSvBL4feCgZtGDmn3MuL7t/gpax+b2Zdd163hV/SDJp2kVge+kdRX8tc26i5K8m9aV9EckOQd4c1V9v/uvoqsVwM8Al7Xq8NaPBrSPhb61qu5u7u9o/r25bf0OWj/7jHt/xqq6J8kNwAFA0forwva2trvx078k7LQtQJLH0Crg1zT93B24rMefrZv2fTyihz5J0ry88i1Jjar6Z+BM4M+aRdcDH66q5W23B1bVqc36meL7l5v7/0yr+H56c5/mivT7gNcD+1TVcuAqWoXrvbtuu78NuAt4eNuyVfN0/WPAS5McSevq8cx/Dqiqv6qqw4HH0hoysn6e5+rmu7SK58e1/S4e2hbKXIx7f8ZmqMmBwI20fu/fnvV7f3BVPbdt25r1XO8BvgUcXFUPAd7Kzr/j2W6nVaTP7P9hHdq076OXPknSvCy+JWlnfwE8qwn/fQT41SRrk+yW5AFNsPDApu2/AqtpDSH5cjO04hG0xnDPhPweSKuI2waQ5NVA19Bjc+X4HFrBy59pxnCvm6fPn2n2+8fA2c0wCZI8KcmTmzHpt9MaR37PAn4X7f26h9Z/Ik5P8rPN86/cxTHPhzfB0N2BN9L6i8GlwJeBHyR5S5Jlze/+8UmeNMdzPRj4PvDDJIcAvz1r/c3Ao9oefw14XJJDkzwAOGWevi6mT5J0HxbfktSmqrYBZ9Eac309rWDfW2kVz9fTunJ8v6bt7cBXgW9U1U+ap/gicF1V3dK0+Sbwrmb5zbTGDn9hnm68ntbwjJtoXYn/4Dx9voNWwX40rWEzMx5Cq2D+Hq2hK7cCpwEkeWuSz87Tj9neQiuMemnzLSMX0PrPx2KdB7y46d8rgBdW1Z3Nf0CeRyvY+W1aV93fTys02s2baQUkf0DrZ54dqjwF+FDzbSjHVdW/0/rPygW0xuDPOYnRIvskSfeRqtl/uZMkqb+SnAI8uqpePuy+SNIgeeVbkiRJGhCLb0mSJGlAHHYiSZIkDYhXviVJkqQBmapJdvbdd9866KCDht0NSZIkTbDLLrvsu1W1otO6qSq+DzroIDZt2jTsbkiSJGmCJek6M7HDTiRJkqQBsfiWJEmSBsTiW5IkSRoQi29JkiRpQCy+JUmSpAEZ6redJHkO8JfAbsD7q+rUWevvD5wFHA7cCry4qr7TrDsZeA1wN/CGqto4wK5LmlLnXr6F0zZu5sbtOzhg+TLWr13NsYet7Lp8MdsMa7l97X3fkrRYQ5vhMsluwL8DzwJuAL4CvLSqvtnW5nXAE6vqxCQvAV5QVS9O8ljgY8ARwAHABcBjquruufa5Zs2a8qsGJS3WuZdv4eRzrmTHnT891CzbYzd+7fCV/MNlW+6z/B0vfALAgrYZ1nL72vu+LcAlzSfJZVW1puO6IRbfRwKnVNXa5vHJAFX1jrY2G5s2X0yyO3ATsAI4qb1te7u59mnxLWlXPPXUi9iyfcd9lu+WcHeHY+nK5csAFrTNsJbb1973/YWT/q/7LJekdnMV38McdrISuL7t8Q3Ak7u1qaq7kvwXsE+z/NJZ23a8FJHkBOAEgFWrVi1JxyVNpxs7FHtAxyJtrvZzbTOs5fZ11/ctSb2Y+MBlVZ1RVWuqas2KFR1n+ZSknhzQXHGdbbeka/uFbjOs5fa1931L0q4YZvG9BXh42+MDm2Ud2zTDTh5KK3jZy7aStGjnXr6Fp556EY886dM89dSLOPfyLaxfu5ple+y2U7tle+zGS5/88I7L169dveBthrXcvva+707vDUnq1TCHnXwFODjJI2kVzi8BfmNWmw3AOuCLwIuAi6qqkmwAPprkz2kFLg8GvjywnkuaaLODlVu27+Dkc67kHS98Au944RM6fgPGmkfsPe
c3Yyxkm2Ett6/zLwc6vjcAg5iSejK0wCVAkucCf0HrqwY/UFVvT/LHwKaq2pDkAcCHgcOA24CXVNW1zbZvA34TuAt4Y1V9dr79GbiU1ItuwUrDdvK9IakXoxq4pKo+A3xm1rI/bLv/Y+DXu2z7duDtfe2gpKnULVRn2E6+NyTtqokPXErSQnUL1Rm2k+8NSbvK4lvSVFtIsHJmzK+m11zvDYOYknph8S1pas0EK7ds30Gxc3juHS98AiuXLyO0xvM6s6GgFars9N4AOr6XLMAlzTbUwOWgGbiU1M7wnJaK7yVJ7eYKXHrlW9LUMjynpeJ7SVKvLL4lTS3Dc1oqvpck9criW9JUMFipfprvvWQYU9IMi29JE89gpfqtWxDz2MNWdn3/WYBL08nApaSJZxhOw+T7T5o+Bi4lTTXDcBom33+S2ll8S5p4huE0TL7/JLWz+JY0UQxWatQ4K6akdhbfkiaGwUqNImfFlNTOwKWkiWGwTePE96s0uQxcSpoKBts0Tny/StPJ4lvSxDDYpnHi+1WaTkMpvpPsneT8JFc3/+7Vpd26ps3VSda1Lb8kyeYkVzS3nx1c7yUNW7eQmsFKjRODmNJ02n1I+z0JuLCqTk1yUvP4Le0NkuwN/BGwBijgsiQbqup7TZOXVZUDuKUpMxOq3HHn3cDOocqZAOVpGzdz4/YdHLB8GevXrjZYqZHU7f0KzPselzS+hhK4TLIZOKqqtibZH7ikqlbPavPSps1vNY/f27T7WJJLgDcvtPg2cCmNP0NqmnS+x6XxN4qBy/2qamtz/yZgvw5tVgLXtz2+oVk244PNkJM/SJJuO0pyQpJNSTZt27ZtlzsuabgMqWnS+R6XJlvfiu8kFyS5qsPtmPZ21br0vtDL7y+rqicAT2tur+jWsKrOqKo1VbVmxYoVC/45JI0WQ2qadL7HpcnWtzHfVXV0t3VJbk6yf9uwk1s6NNsCHNX2+EDgkua5tzT//iDJR4EjgLOWqOuSRsS5l2+5z3jY9WtX7zQeFgxVarLM9R7v9JlwHLg0XoY17GQDMPPtJeuA8zq02Qg8O8lezbehPBvYmGT3JPsCJNkDeB5w1QD6LGmAnK1S08oZMaXJNqzA5T7AJ4BVwHXAcVV1W5I1wIlVdXzT7jeBtzabvb2qPpjkgcDngD2A3YALgN+vqrtn72c2A5fS+DB0Ju3Mz4Q0PuYKXA7lqwar6lbgmR2WbwKOb3v8AeADs9rcDhze7z5KGi5DZ9LO/ExIk8EZLiWNJENn0s78TEiTweJb0tB1ms3P2SqlnTkjpjQZLL4lDZXBSqk3BjGlyTCUwOWwGLiURo8hMmnX+BmSRs8oznApSYAhMmlX+RmSxovFt6ShMkQm7Ro/Q9J4sfiWNDAGK6WlZxBTGi8W35IGwmCl1B8GMaXxYuBS0kAYCpMGy8+cNDwGLiUNnaEwabD8zEmjyeJb0kAYCpMGy8+cNJosviUtOYOV0vDN95kzjCkNh8W3pCVlsFIaDd2CmMcetrLr59QCXOo/A5eSlpQhL2n0+TmV+svApaSBMeQljT4/p9LwWHxLWlKGvKTR5+dUGp6hFN9J9k5yfpKrm3/36tLun5JsT/KpWcsfmeRLSa5JcnaSPQfTc0ntDFZK48lZMaXhGdaV75OAC6vqYODC5nEnpwGv6LD8ncDpVfVo4HvAa/rSS0ldGayUxpezYkrDM5TAZZLNwFFVtTXJ/sAlVdXxsliSo4A3V9XzmscBtgEPq6q7khwJnFJVa+fbr4FLaekY2JImj59raWmMYuByv6ra2ty/CdhvAdvuA2yvqruaxzcAXS+pJTkhyaYkm7Zt27a43kq6DwNb0uTxcy31X9+K7yQXJLmqw+2Y9nbVuvTet8vvVXVGVa2pqjUrVqzo126kqWNgS5o8fq6l/tu9X09cVUd3W5fk5iT7tw07uWUBT30rsDzJ7s3V7wMBB6NJfXTu5Vs4beNmbty+gwOWL2P92tWsX7uak8+5kh133n1vO4OV0nib63Pd6ThglkNauGENO9kArGvurwPO63XD5kr5xcCLFrO9pIUxWClND4OYUv8NK3C5D/AJYBVwHXBcVd2WZA1wYlUd37T7PHAI8CBaV7xfU1UbkzwK+DiwN3A58PKqumO+/Rq4lBbOAJYkjwPSwswVuOzbsJO5VNWtwDM7LN8EHN/2+Gldtr8WOKJvHZR0LwNYkjwOSEvHGS4lzckAliSPA9LSsfiWBHSerRLmnglP0nRwRkxp6Qxl2Imk0TITqpz5hoP2UOVMgNJvOZCmV7fjADDvsUPSzoYSuBwWA5dSZ4apJC2Gxw6ps1Gc4VLSCDFMJWkxPHZIC+ewE2nKdJoo44DlyzpevTJMJWkucx07nJRH6swr39IU6TZhzjMOWWGoUtKCdQtiPuOQFU7KI3Vh8S1NkdM2bt5p2miAHXfezcXf2uZslZIWrNuMmBd/a1vHY81pGzcPp6PSCHHYiTRF5hqfeexhKy22JS1Yp2PHm86+omNbx4JLXvmWpooTZUgaBI81UncW39KE6jTxhRPmSBoEJ+WRurP4liZQt2Al4NhuSX3XbSw4YBBTU89JdqQJ5MQXkkaRxyZNCyfZkaaME19IGkUemySLb2kiGXaSNIo8NklDKr6T7J3k/CRXN//u1aXdPyXZnuRTs5afmeTbSa5obocOpOPSCDJYKWlcGMSUhnfl+yTgwqo6GLiwedzJacAruqxbX1WHNrcr+tBHaeQZrJQ0TgxiSkMKXCbZDBxVVVuT7A9cUlUdL8klOQp4c1U9r23ZmcCnquqTC9mvgUtNGsNLkiaBxzJNmlEMXO5XVVub+zcB+y3iOd6e5OtJTk9y/26NkpyQZFOSTdu2bVtUZ6VRZXhJ0iTwWKZp0rfiO8kFSa7qcDumvV21Lr0v9PL7ycAhwJOAvYG3dGtYVWdU1ZqqWrNixYqF/hjSSDO8JGkSeCzTNNm9X09cVUd3W5fk5iT7tw07uWWBzz1z1fyOJB8E3rwLXZXGwrmXb+G0jZu5cfsODli+jPVrV7N+7WpOPudKdtx5973tDFZKGjfzHcs6Hf/MsGhcDWvYyQZgXXN/HXDeQjZuCnaSBDgWuGopOyeNGoOVkiZZtyDmsYet7Hr8M4ypcTWswOU+wCeAVcB1wHFVdVuSNcCJVXV80+7ztIaXPAi4FXhNVW1MchGwAghwRbPND+fbr4FLjSvDSJKmlcc/jaO5Apd9G3Yyl6q6FXhmh+WbgOPbHj+ty/Z+2jRVDCNJmlYe/zRpnOFSGgOGkSRNK49/mjQW39KIccZKSfopZ8XUpLH4lkaIwUpJ2pmzYmrSzBm4TPKLc21cVV9d8h71kYFLjTqDRZLUG4+XGmW7Erh8V/PvA4A1wNdofcPIE4FNwJFL1UlJBoskqVceLzWu5hx2UlXPqKpnAFuBX2xmijwcOAzw7zrSEjNYJEm98XipcdXrmO/VVXXlzIOqugr4+f50SZoOBislafEMYmpc9Vp8X5nk/UmOam7vA77ez45Jk8xgpSTtGoOYGlc9zXCZ5AHAbwO/3Cz6HPCeq
+ [... remainder of base64-encoded PNG output omitted: scatter plot "Reward vs. Temperature" (x-axis: Temperature, y-axis: Reward) ...]
+ "text/plain": [
          " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "def reward(temp):\n", + " delta = abs(temp - 0.5)\n", + " if delta < 0.1:\n", + " return 0.0\n", + " else:\n", + " return -delta + 0.1\n", + "\n", + "temps = [x * 0.01 for x in range(100)]\n", + "rewards = [reward(x) for x in temps]\n", + "\n", + "fig=plt.figure(figsize=(12, 4))\n", + "\n", + "plt.scatter(temps, rewards)\n", + "plt.xlabel('Temperature')\n", + "plt.ylabel('Reward')\n", + "plt.title('Reward vs. Temperature')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Environment Setup\n", + "\n", + "The environment responds to actions. It is what keeps track of the temperature state of the room, returns the reward for being in that temperature state, and tells you if the episode is over or not (in this case, we just set a max episode length that can happen).\n", + "\n", + "Here is the gist of the flow:\n", + "\n", + "- Create an environment by calling `Environment.create()`, see below, telling it to use the class you created for this (here, the ThermostatEnvironment) and the max timesteps per episode. The enviroment is assigned to the name `environment`.\n", + "- Initialize the environment `environment` by calling `environment.reset()`. This will do stuff, most importantly, it will initialize the `timestep` attribute to 0.\n", + "- When you want to take an action on the current state of the environment, you will call `environment.execute()`. If you want to have the heater off, you call `environment.execute(0)`, and if you want to have the heater on, you call `environment.execute(1)`.\n", + "- What the `execute()` call returns is a tuple with 3 entries:\n", + " - __state__. In this case, the state is the current temperature that results from taking the action. If you turn on the heater, the temperature will rise from the previous state, and if the heater was turned off, the temperature will fall from the previous state. This should be kept as a numpy array, even though it seems like overkill with a single value for the state coming back. For more complex examples beyond this thermostat, there will be more than 1 component to the state.\n", + " - __terminal__. This is a True/False value. It is True if the episode terminated. In this case, that will happen once you exceed the max number of steps you have set. Otherwise, it will be False, which lets the agent know that it can take further steps.\n", + " - __reward__. This is the reward for taking the action you took.\n", + "\n", + "Below, to train the agent, you will have the agent take actions on the environment, and the environment will return these signals so that the agent can self-train to optimize its reward." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "###-----------------------------------------------------------------------------\n", + "## Imports\n", + "from tensorforce.environments import Environment\n", + "from tensorforce.agents import Agent\n", + "\n", + "\n", + "\n", + "###-----------------------------------------------------------------------------\n", + "### Environment definition\n", + "class ThermostatEnvironment(Environment):\n", + " \"\"\"This class defines a simple thermostat environment. 
It is a room with\n", + " a heater, and when the heater is on, the room temperature will approach\n", + " the max heater temperature (usually 1.0), and when off, the room will\n", + " decay to a temperature of 0.0. The exponential constant that determines\n", + " how fast it approaches these temperatures over timesteps is tau.\n", + " \"\"\"\n", + " def __init__(self):\n", + " ## Some initializations. Will eventually parameterize this in the constructor.\n", + " self.tau = 3.0\n", + " self.current_temp = np.random.random(size=(1,))\n", + "\n", + " super().__init__()\n", + "\n", + "\n", + " def states(self):\n", + " return dict(type='float', shape=(1,), min_value=0.0, max_value=1.0)\n", + "\n", + "\n", + " def actions(self):\n", + " \"\"\"Action 0 means no heater, temperature approaches 0.0. Action 1 means\n", + " the heater is on and the room temperature approaches 1.0.\n", + " \"\"\"\n", + " return dict(type='int', num_values=2)\n", + "\n", + "\n", + " # Optional, should only be defined if environment has a natural maximum\n", + " # episode length\n", + " def max_episode_timesteps(self):\n", + " return super().max_episode_timesteps()\n", + "\n", + "\n", + " # Optional\n", + " def close(self):\n", + " super().close()\n", + "\n", + "\n", + " def reset(self):\n", + " \"\"\"Reset state.\n", + " \"\"\"\n", + " # state = np.random.random(size=(1,))\n", + " self.timestep = 0\n", + " self.current_temp = np.random.random(size=(1,))\n", + " return self.current_temp\n", + "\n", + "\n", + " def response(self, action):\n", + " \"\"\"Respond to an action. When the action is 1, the temperature\n", + " exponentially decays approaches 1.0. When the action is 0,\n", + " the current temperature decays towards 0.0.\n", + " \"\"\"\n", + " return action + (self.current_temp - action) * math.exp(-1.0 / self.tau)\n", + "\n", + "\n", + " def reward_compute(self):\n", + " \"\"\" The reward here is 0 if the current temp is between 0.4 and 0.6,\n", + " else it is distance the temp is away from the 0.4 or 0.6 boundary.\n", + " \n", + " Return the value within the numpy array, not the numpy array.\n", + " \"\"\"\n", + " delta = abs(self.current_temp - 0.5)\n", + " if delta < 0.1:\n", + " return 0.0\n", + " else:\n", + " return -delta[0] + 0.1\n", + "\n", + "\n", + " def execute(self, actions):\n", + " ## Check the action is either 0 or 1 -- heater on or off.\n", + " assert actions == 0 or actions == 1\n", + "\n", + " ## Increment timestamp\n", + " self.timestep += 1\n", + " \n", + " ## Update the current_temp\n", + " self.current_temp = self.response(actions)\n", + " \n", + " ## Compute the reward\n", + " reward = self.reward_compute()\n", + "\n", + " ## The only way to go terminal is to exceed max_episode_timestamp.\n", + " ## terminal == False means episode is not done\n", + " ## terminal == True means it is done.\n", + " terminal = False\n", + " \n", + " return self.current_temp, terminal, reward\n", + "\n", + "###-----------------------------------------------------------------------------\n", + "### Create the environment\n", + "### - Tell it the environment class\n", + "### - Set the max timestamps that can happen per episode\n", + "environment = environment = Environment.create(\n", + " environment=ThermostatEnvironment,\n", + " max_episode_timesteps=100)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Agent setup\n", + "\n", + "Here we configure a type of agent to learn against this environment. There are many agent configurations to choose from, which we will not cover here. 
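As a pointer for readers who later want to try one of those other configurations, the sketch below shows what a named agent preset could look like. This is not part of the notebook: `'ppo'`, `batch_size`, and `learning_rate` here are assumptions based on the Tensorforce agent catalogue, and the tutorial itself uses the generic `'tensorforce'` agent configured in the next cell.

```python
# Hypothetical alternative (not used in this notebook): a named agent preset.
# 'ppo' and its arguments are assumptions based on the Tensorforce agent
# catalogue; 'environment' is the wrapped ThermostatEnvironment created above.
from tensorforce.agents import Agent

ppo_agent = Agent.create(
    agent='ppo',              # named preset instead of the generic 'tensorforce' agent
    environment=environment,  # reuse the wrapped environment from the cell above
    batch_size=10,            # episodes collected per update
    learning_rate=1e-3,
)
```

Any agent created this way plugs into the same act/execute/observe loop used later in the notebook.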
We will just take a basic agent to train."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agent = Agent.create(\n",
+ "    agent='tensorforce', environment=environment, update=64,\n",
+ "    optimizer=dict(optimizer='adam', learning_rate=1e-3),\n",
+ "    objective='policy_gradient', reward_estimation=dict(horizon=1)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Check: Untrained Agent Performance\n",
+ "\n",
+ "Let's see how the untrained agent performs on the environment. The two red horizontal lines mark the target temperature band.\n",
+ "\n",
+ "The untrained agent does not yet act to keep the temperature inside the band; its initial policy typically leaves the heater either always off or always on."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ [base64-encoded PNG output omitted: line plot "Temperature vs. Timestep" for the untrained agent, with red target lines at 0.4 and 0.6]
+ "text/plain": [
          " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "### Initialize\n", + "environment.reset()\n", + "\n", + "## Creation of the environment via Environment.create() creates\n", + "## a wrapper class around the original Environment defined here.\n", + "## That wrapper mainly keeps track of the number of timesteps.\n", + "## In order to alter the attributes of your instance of the original\n", + "## class, like to set the initial temp to a custom value, like here,\n", + "## you need to access the `environment` member of this wrapped class.\n", + "## That is why you see the way to set the current_temp like below.\n", + "environment.current_temp = np.array([0.5])\n", + "states = environment.current_temp\n", + "\n", + "internals = agent.initial_internals()\n", + "terminal = False\n", + "\n", + "### Run an episode\n", + "temp = [environment.current_temp[0]]\n", + "while not terminal:\n", + " actions, internals = agent.act(states=states, internals=internals, independent=True)\n", + " states, terminal, reward = environment.execute(actions=actions)\n", + " temp += [states[0]]\n", + " \n", + "### Plot the run\n", + "plt.figure(figsize=(12, 4))\n", + "ax=plt.subplot()\n", + "ax.set_ylim([0.0, 1.0])\n", + "plt.plot(range(len(temp)), temp)\n", + "plt.hlines(y=0.4, xmin=0, xmax=99, color='r')\n", + "plt.hlines(y=0.6, xmin=0, xmax=99, color='r')\n", + "plt.xlabel('Timestep')\n", + "plt.ylabel('Temperature')\n", + "plt.title('Temperature vs. Timestep')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train the agent\n", + "\n", + "Here we train the agent against episodes of interacting with the environment." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Train for 200 episodes\n", + "for _ in range(200):\n", + " states = environment.reset()\n", + " terminal = False\n", + " while not terminal:\n", + " actions = agent.act(states=states)\n", + " states, terminal, reward = environment.execute(actions=actions)\n", + " agent.observe(terminal=terminal, reward=reward)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check: Trained Agent Performance\n", + "\n", + "You can plainly see that this is toggling the heater on/off to keep the temperature within the target band!" 
+    ] +   }, +   { +    "cell_type": "code", +    "execution_count": 7, +    "metadata": {}, +    "outputs": [ +     { +      "data": { +       "image/png": "[base64 image/png data omitted: matplotlib figure of Temperature vs. Timestep for the trained agent, target band marked at 0.4 and 0.6]\n", +      "text/plain": [ +       "
          " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "### Initialize\n", + "environment.reset()\n", + "\n", + "## Creation of the environment via Environment.create() creates\n", + "## a wrapper class around the original Environment defined here.\n", + "## That wrapper mainly keeps track of the number of timesteps.\n", + "## In order to alter the attributes of your instance of the original\n", + "## class, like to set the initial temp to a custom value, like here,\n", + "## you need to access the `environment` member of this wrapped class.\n", + "## That is why you see the way to set the current_temp like below.\n", + "environment.current_temp = np.array([1.0])\n", + "states = environment.current_temp\n", + "\n", + "internals = agent.initial_internals()\n", + "terminal = False\n", + "\n", + "### Run an episode\n", + "temp = [environment.current_temp[0]]\n", + "while not terminal:\n", + " actions, internals = agent.act(states=states, internals=internals, independent=True)\n", + " states, terminal, reward = environment.execute(actions=actions)\n", + " temp += [states[0]]\n", + "\n", + "### Plot the run\n", + "plt.figure(figsize=(12, 4))\n", + "ax=plt.subplot()\n", + "ax.set_ylim([0.0, 1.0])\n", + "plt.plot(range(len(temp)), temp)\n", + "plt.hlines(y=0.4, xmin=0, xmax=99, color='r')\n", + "plt.hlines(y=0.6, xmin=0, xmax=99, color='r')\n", + "plt.xlabel('Timestep')\n", + "plt.ylabel('Temperature')\n", + "plt.title('Temperature vs. Timestep')\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/threaded_ale.py b/examples/threaded_ale.py deleted file mode 100644 index 041b3f246..000000000 --- a/examples/threaded_ale.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -Arcade Learning Environment execution. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from copy import deepcopy - -from six.moves import xrange -import argparse -import logging -import os -import sys -import time -import numpy as np - -from tensorforce import TensorForceError -from tensorforce.agents import agents as AgentsDictionary, Agent -import json -from tensorforce.execution import ThreadedRunner -from tensorforce.contrib.ale import ALE -from tensorforce.execution.threaded_runner import WorkerAgentGenerator - -""" -To replicate the Asynchronous Methods for Deep Reinforcement Learning paper (https://arxiv.org/abs/1602.01783) -Nstep DQN: - python threaded_ale.py breakout.bin -a configs/dqn_visual.json -n - configs/cnn_dqn2013_network.json -fs 4 -ea -w 4 - - - Note: batch_size in the config should be set to n+1 where n is the desired number of steps -""" - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument('rom', help="File path of the rom") - parser.add_argument('-a', '--agent-config', help="Agent configuration file") - parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") - parser.add_argument('-w', '--workers', help="Number of threads to run where the model is shared", type=int, default=16) - parser.add_argument('-fs', '--frame-skip', help="Number of frames to repeat action", type=int, default=1) - parser.add_argument('-rap', '--repeat-action-probability', help="Repeat action probability", type=float, default=0.0) - parser.add_argument('-lolt', '--loss-of-life-termination', help="Loss of life counts as terminal state", action='store_true') - parser.add_argument('-lolr', '--loss-of-life-reward', help="Loss of life reward/penalty. EX: -1 to penalize", type=float, default=0.0) - parser.add_argument('-ea', '--epsilon-annealing', help='Create separate epislon annealing schedules per thread', action='store_true') - parser.add_argument('-ds', '--display-screen', action='store_true', default=False, help="Display emulator screen") - parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") - parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") - parser.add_argument('-s', '--save', help="Save agent to this dir") - parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") - parser.add_argument('-l', '--load', help="Load agent from this dir") - parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") - - args = parser.parse_args() - - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) # configurable!!! 
- logger.addHandler(logging.StreamHandler(sys.stdout)) - - environments = [ALE(args.rom, frame_skip=args.frame_skip, - repeat_action_probability=args.repeat_action_probability, - loss_of_life_termination=args.loss_of_life_termination, - loss_of_life_reward=args.loss_of_life_reward, - display_screen=args.display_screen) for _ in range(args.workers)] - - if args.network_spec: - with open(args.network_spec, 'r') as fp: - network_spec = json.load(fp=fp) - else: - network_spec = None - logger.info("No network configuration provided.") - - agent_configs = [] - if args.agent_config is not None: - with open(args.agent_config, 'r') as fp: - agent_config = json.load(fp=fp) - else: - raise TensorForceError("No agent configuration provided.") - - for i in range(args.workers): - worker_config = deepcopy(agent_config) - - # Optionally overwrite epsilon final values - if "explorations_spec" in worker_config and worker_config['explorations_spec']['type'] == "epsilon_anneal": - if args.epsilon_annealing: - # epsilon final values are [0.5, 0.1, 0.01] with probabilities [0.3, 0.4, 0.3] - epsilon_final = np.random.choice([0.5, 0.1, 0.01], p=[0.3, 0.4, 0.3]) - worker_config['explorations_spec']["epsilon_final"] = epsilon_final - - agent_configs.append(worker_config) - - # Let the first agent create the model - # Manually assign model - logger.info(agent_configs[0]) - - agent = Agent.from_spec( - spec=agent_configs[0], - kwargs=dict( - states_spec=environments[0].states, - actions_spec=environments[0].actions, - network_spec=network_spec - ) - ) - - agents = [agent] - - for i in xrange(args.workers - 1): - config = agent_configs[i] - agent_type = config.pop('type', None) - worker = WorkerAgentGenerator(AgentsDictionary[agent_type])( - states_spec=environments[0].states, - actions_spec=environments[0].actions, - network_spec=network_spec, - model=agent.model, - **config - ) - agents.append(worker) - - if args.load: - load_dir = os.path.dirname(args.load) - if not os.path.isdir(load_dir): - raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) - agent.restore_model(args.load) - - if args.debug: - logger.info("-" * 16) - logger.info("Configuration:") - logger.info(agent_configs[0]) - - if args.save: - save_dir = os.path.dirname(args.save) - if not os.path.isdir(save_dir): - try: - os.mkdir(save_dir, 0o755) - except OSError: - raise OSError("Cannot save agent to dir {} ()".format(save_dir)) - - def episode_finished(stats): - if args.debug: - logger.info( - "Thread {t}. Finished episode {ep} after {ts} timesteps. Reward {r}". 
- format(t=stats['thread_id'], ep=stats['episode'], ts=stats['timestep'], r=stats['episode_reward']) - ) - return True - - def summary_report(r): - et = time.time() - logger.info('=' * 40) - logger.info('Current Step/Episode: {}/{}'.format(r.global_step, r.global_episode)) - logger.info('SPS: {}'.format(r.global_step / (et - r.start_time))) - reward_list = r.episode_rewards - if len(reward_list) > 0: - logger.info('Max Reward: {}'.format(np.max(reward_list))) - logger.info("Average of last 500 rewards: {}".format(sum(reward_list[-500:]) / 500)) - logger.info("Average of last 100 rewards: {}".format(sum(reward_list[-100:]) / 100)) - logger.info('=' * 40) - - # Create runners - threaded_runner = ThreadedRunner( - agents, - environments, - repeat_actions=1, - save_path=args.save, - save_episodes=args.save_episodes - ) - - logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environments[0])) - threaded_runner.run(summary_interval=100, episode_finished=episode_finished, summary_report=summary_report) - threaded_runner.close() - logger.info("Learning finished. Total episodes: {ep}".format(ep=threaded_runner.global_episode)) - - -if __name__ == '__main__': - main() diff --git a/examples/unreal_engine.py b/examples/unreal_engine.py deleted file mode 100644 index f335cf3f2..000000000 --- a/examples/unreal_engine.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -Test an Unreal Engine Game as RL-Environment -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import json -import logging -import os -import sys -import time -import random -from PIL import Image - -from tensorforce import TensorForceError -from tensorforce.agents import Agent -from tensorforce.execution import Runner -from tensorforce.contrib.unreal_engine import UE4Environment - - -# Users need to give the port on which the UE4 Game listens on for incoming RL-client connections. 
-# To learn about setting up UE4 Games as RL-environments, go to: https://github.com/ducandu/engine2learn -# - you will need to install the UE4 Engine and the engine2learn plugin -# - supports headless execution of UE4 games under Linux - -# python examples/unreal_engine.py 6025 -a examples/configs/vpg.json -# -n examples/configs/mlp2_network.json -e 50000 -m 2000 - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument('-P', '--port', default=6025, - help="Port on which the UE4 Game listens on for incoming RL-client connections") - parser.add_argument('-H', '--host', default=None, help="Hostname of the UE4 Game (default: localhost)") - parser.add_argument('-a', '--agent-config', help="Agent configuration file") - parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") - parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") - parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") - parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, - help="Maximum number of timesteps per episode") - parser.add_argument('-d', '--deterministic', action='store_true', default=False, - help="Choose actions deterministically") - parser.add_argument('-l', '--load', help="Load agent from this dir") - parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") - parser.add_argument('-R', '--random-test-run', action="store_true", help="Do a quick random test run on the env") - - args = parser.parse_args() - - # logging.basicConfig(filename="logfile.txt", level=logging.INFO) - logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) - - # We have to connect this remote env to get the specs. - # We also discretize axis-mappings b/c we will use a deep q-network. - # Use num_ticks==6 to match Nature paper by Mnih et al. - # ("human cannot press fire button with more than 10Hz", dt=1/60) - # TODO: Need to build in capturing and concat'ing last 4 images (plus 8-bit conversion!) into 1 input state signal. - # TODO: Use pre-processor for that. - environment = UE4Environment(host=args.host, port=args.port, connect=True, discretize_actions=True, num_ticks=6) - environment.seed(200) - - # Do a quick random test-run with image capture of the first n images -> then exit after 1000 steps. - if args.random_test_run: - # Reset the env. - s = environment.reset() - img_format = "RGB" if len(environment.states["shape"]) == 3 else "L" - img = Image.fromarray(s, img_format) - # Save first received image as a sanity-check. 
- img.save("reset.png") - for i in range(1000): - s, is_terminal, r = environment.execute(actions=random.choice(range(environment.actions["num_actions"]))) - if i < 10: - img = Image.fromarray(s, img_format) - img.save("{:03d}.png".format(i)) - logging.debug("i={} r={} term={}".format(i, r, is_terminal)) - if is_terminal: - environment.reset() - quit() - - if args.agent_config is not None: - with open(args.agent_config, 'r') as fp: - agent_config = json.load(fp=fp) - else: - raise TensorForceError("No agent configuration provided.") - - if args.network_spec is not None: - with open(args.network_spec, 'r') as fp: - network_spec = json.load(fp=fp) - else: - network_spec = None - logger.info("No network configuration provided.") - - agent = Agent.from_spec( - spec=agent_config, - kwargs=dict( - states=environment.states, - actions=environment.actions, - network=network_spec - ) - ) - if args.load: - load_dir = os.path.dirname(args.load) - if not os.path.isdir(load_dir): - raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) - agent.restore_model(args.load) - - if args.debug: - logger.info("-" * 16) - logger.info("Configuration:") - logger.info(agent_config) - - runner = Runner( - agent=agent, - environment=environment, - repeat_actions=1 - ) - - if args.debug: # TODO: Timestep-based reporting - report_episodes = 1 - else: - report_episodes = 100 - - logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) - - def episode_finished(r, id_): - if r.episode % report_episodes == 0: - steps_per_second = r.global_timestep / (time.time() - r.start_time) - logger.info("Finished episode {} after {} timesteps. SPS={}".format( - r.global_episode, r.episode_timestep, steps_per_second - )) - logger.info("Episode reward: {}".format(r.episode_rewards[-1])) - logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / - min(500, len(r.episode_rewards)))) - logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / - min(100, len(r.episode_rewards)))) - return True - - runner.run( - timesteps=args.timesteps, - episodes=args.episodes, - max_episode_timesteps=args.max_episode_timesteps, - deterministic=args.deterministic, - episode_finished=episode_finished - ) - runner.close() - - logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode)) - - -if __name__ == '__main__': - main() diff --git a/examples/vectorized_environment.py b/examples/vectorized_environment.py new file mode 100644 index 000000000..4be89d03e --- /dev/null +++ b/examples/vectorized_environment.py @@ -0,0 +1,107 @@ +# Copyright 2021 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np + +from tensorforce import Environment, Runner + + +class VectorizedEnvironment(Environment): + """ + Example vectorized environment, illustrating best-practice implementation pattern. 
+ + State space: position in [0, 10]. + Action space: movement in {-1, 0, 1}. + Random start in [0, 3] or [7, 10]. + Positive reward for moving towards the center 5. + """ + + def __init__(self): + super().__init__() + + def states(self): + return dict(type='int', num_values=11) + + def actions(self): + return dict(type='int', num_values=3) + + def is_vectorizable(self): + return True # Indicates that environment is vectorizable + + def reset(self, num_parallel=None): + # Always for vectorized environments: initialize parallel indices + self._is_parallel = (num_parallel is not None) + if self._is_parallel: + self._parallel_indices = np.arange(num_parallel) + else: + self._parallel_indices = np.arange(1) + + # Vectorized environment logic + is_high = (np.random.random_sample(size=self._parallel_indices.shape) < 0.5) + offset = np.random.randint(4, size=self._parallel_indices.shape) + self._states = np.where(is_high, 10 - offset, offset) + + # Always for vectorized environments: return un-/vectorized values + if self._is_parallel: + return self._parallel_indices.copy(), self._states.copy() + else: + return self._states[0] + + def execute(self, actions): + # Always for vectorized environments: expand actions if non-vectorized + if not self._is_parallel: + actions = np.expand_dims(actions, axis=0) + + # Vectorized environment logic + reward = np.select( + condlist=[self._states < 5, self._states > 5], + choicelist=[(actions == 2).astype(np.float32), (actions == 0).astype(np.float32)], + default=np.ones(shape=self._parallel_indices.shape, dtype=np.float32) + ) + terminal = (np.random.random_sample(size=self._parallel_indices.shape) < 0.1) + self._states = np.clip(self._states + (actions - 1), a_min=0, a_max=10) + + # Always for vectorized environments: update parallel indices and states, + # and return un-/vectorized values + if self._is_parallel: + self._parallel_indices = self._parallel_indices[~terminal] + self._states = self._states[~terminal] + return self._parallel_indices.copy(), self._states.copy(), terminal, reward + else: + return self._states[0], terminal.item(), reward.item() + + +def main(): + # Non-vectorized runner + runner = Runner( + agent='benchmarks/configs/ppo.json', + environment=VectorizedEnvironment, + max_episode_timesteps=10 + ) + runner.run(num_episodes=1000) + + # Vectorized runner, automatically if num_parallel > 1 and environment.is_vectorizable() + # (and remote argument not specified) + runner = Runner( + agent='benchmarks/configs/ppo.json', + environment=VectorizedEnvironment, + max_episode_timesteps=10, + num_parallel=16 + ) + runner.run(num_episodes=1000) + + +if __name__ == '__main__': + main() diff --git a/requirements-all.txt b/requirements-all.txt new file mode 100644 index 000000000..bf7b0e3bf --- /dev/null +++ b/requirements-all.txt @@ -0,0 +1,21 @@ +gym >= 0.21.0, <0.23 +h5py >= 3.6.0 +matplotlib >= 3.5.1 +msgpack >= 1.0.3 +msgpack-numpy >= 0.4.7.1 +numpy ~= 1.21.5 +Pillow >= 9.0.0 +tensorflow == 2.12.1 +tqdm >= 4.62.3 +tensorflow-addons >= 0.15.0 +hpbandster >= 0.7.4 +ale-py >= 0.7.3 +gym[box2d,classic_control] >= 0.21.0 +box2d >= 2.3.10 +gym-retro >= 0.8.0 +vizdoom >= 1.1.11 +m2r >= 0.2.1 +recommonmark >= 0.7.1 +sphinx >= 4.3.2 +sphinx-rtd-theme >= 1.0.0 +pytest >= 6.2.5 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..1d9ed7f19 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +gym >= 0.21.0, <0.23 +h5py >= 3.6.0 +matplotlib >= 3.5.1 +msgpack >= 1.0.3 +msgpack-numpy >= 0.4.7.1 +numpy ~= 1.21.5 +Pillow >= 9.0.0 
+tensorflow == 2.12.1 +tqdm >= 4.62.3 diff --git a/run.py b/run.py new file mode 100644 index 000000000..77922406e --- /dev/null +++ b/run.py @@ -0,0 +1,258 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import argparse +import importlib +import json +import os + +import matplotlib +import numpy as np + +from tensorforce import Environment, Runner + +matplotlib.use('Agg') + +import matplotlib.pyplot as plt + + +def main(): + parser = argparse.ArgumentParser(description='Tensorforce runner') + # Agent arguments + parser.add_argument( + '-a', '--agent', type=str, default=None, + help='Agent (name, configuration JSON file, or library module)' + ) + parser.add_argument( + '-c', '--checkpoints', type=str, default=None, + help='TensorFlow checkpoints directory, plus optional comma-separated filename' + ) + parser.add_argument( + '-s', '--summaries', type=str, default=None, + help='TensorBoard summaries directory, plus optional comma-separated filename' + ) + parser.add_argument( + '--recordings', type=str, default=None, help='Traces recordings directory' + ) + # Environment arguments + parser.add_argument( + '-e', '--environment', type=str, default=None, + help='Environment (name, configuration JSON file, or library module)' + ) + parser.add_argument( + '-l', '--level', type=str, default=None, + help='Level or game id, like `CartPole-v1`, if supported' + ) + parser.add_argument( + '-m', '--max-episode-timesteps', type=int, default=None, + help='Maximum number of timesteps per episode' + ) + parser.add_argument( + '--visualize', action='store_true', + help='Visualize agent--environment interaction, if supported' + ) + parser.add_argument( + '--visualize-directory', type=str, default=None, + help='Directory to store videos of agent--environment interaction, if supported' + ) + parser.add_argument( + '--import-modules', type=str, default=None, + help='Import comma-separated modules required for environment' + ) + # Parallel execution arguments + parser.add_argument( + '--num-parallel', type=int, default=None, + help='Number of environment instances to execute in parallel' + ) + parser.add_argument( + '--batch-agent-calls', action='store_true', + help='Batch agent calls for parallel environment execution' + ) + parser.add_argument( + '--sync-timesteps', action='store_true', + help='Synchronize parallel environment execution on timestep-level' + ) + parser.add_argument( + '--sync-episodes', action='store_true', + help='Synchronize parallel environment execution on episode-level' + ) + parser.add_argument( + '--remote', type=str, choices=('multiprocessing', 'socket-client', 'socket-server'), + default=None, help='Communication mode for remote environment execution of parallelized' + 'environment execution' + ) + parser.add_argument( + '--blocking', action='store_true', help='Remote environments should be blocking' + ) + parser.add_argument( + '--host', type=str, 
default=None, + help='Socket server hostname(s) or IP address(es), single value or comma-separated list' + ) + parser.add_argument( + '--port', type=str, default=None, + help='Socket server port(s), single value or comma-separated list, increasing sequence if' + 'single host and port given' + ) + # Runner arguments + parser.add_argument( + '-v', '--evaluation', action='store_true', + help='Run environment (last if multiple) in evaluation mode' + ) + parser.add_argument('-n', '--episodes', type=int, default=None, help='Number of episodes') + parser.add_argument('-t', '--timesteps', type=int, default=None, help='Number of timesteps') + parser.add_argument('-u', '--updates', type=int, default=None, help='Number of agent updates') + parser.add_argument( + '--mean-horizon', type=int, default=1, + help='Number of episodes progress bar values and evaluation score are averaged over' + ) + parser.add_argument( + '--save-best-agent', type=str, default=None, + help='Directory to save the best version of the agent according to the evaluation score' + ) + # Logging arguments + parser.add_argument('-r', '--repeat', type=int, default=1, help='Number of repetitions') + parser.add_argument( + '--path', type=str, default=None, + help='Logging path, directory plus filename without extension' + ) + parser.add_argument('--seaborn', action='store_true', help='Use seaborn') + args = parser.parse_args() + + if args.import_modules is not None: + for module in args.import_modules.split(','): + importlib.import_module(name=module) + + if args.path is None: + callback = None + + else: + assert os.path.splitext(args.path)[1] == '' + assert args.episodes is not None and args.visualize is not None + rewards = [list() for _ in range(args.episodes)] + timesteps = [list() for _ in range(args.episodes)] + seconds = [list() for _ in range(args.episodes)] + agent_seconds = [list() for _ in range(args.episodes)] + + def callback(r, p): + rewards[r.episodes - 1].append(float(r.episode_returns[-1])) + timesteps[r.episodes - 1].append(int(r.episode_timesteps[-1])) + seconds[r.episodes - 1].append(float(r.episode_seconds[-1])) + agent_seconds[r.episodes - 1].append(float(r.episode_agent_seconds[-1])) + return True + + if args.environment is None: + environment = None + else: + environment = dict(environment=args.environment) + if args.level is not None: + environment['level'] = args.level + if args.visualize: + environment['visualize'] = True + if args.visualize_directory is not None: + environment['visualize_directory'] = args.visualize_directory + + if args.host is not None and ',' in args.host: + args.host = args.host.split(',') + if args.port is not None and ',' in args.port: + args.port = [int(x) for x in args.port.split(',')] + elif args.port is not None: + args.port = int(args.port) + + if args.remote == 'socket-server': + Environment.create( + environment=environment, max_episode_timesteps=args.max_episode_timesteps, + remote=args.remote, port=args.port + ) + return + + if args.agent is None: + assert args.saver is None and args.summarizer is None and args.recorder is None + agent = None + else: + agent = dict(agent=args.agent) + if args.checkpoints is not None: + assert 'saver' not in agent + if ',' in args.checkpoints: + directory, filename = args.checkpoints.split(',') + agent['saver'] = dict(directory=directory, filename=filename) + else: + agent['saver'] = args.checkpoints + if args.summaries is not None: + assert 'summarizer' not in agent + if ',' in args.summaries: + directory, filename = args.summaries.split(',') + 
agent['summarizer'] = dict(directory=directory, filename=filename) + else: + agent['summarizer'] = args.summaries + if args.recordings is not None: + assert 'recorder' not in agent + agent['recorder'] = args.recordings + + for _ in range(args.repeat): + runner = Runner( + agent=agent, environment=environment, max_episode_timesteps=args.max_episode_timesteps, + evaluation=args.evaluation, num_parallel=args.num_parallel, remote=args.remote, + blocking=args.blocking, host=args.host, port=args.port + ) + runner.run( + num_episodes=args.episodes, num_timesteps=args.timesteps, num_updates=args.updates, + batch_agent_calls=args.batch_agent_calls, sync_timesteps=args.sync_timesteps, + sync_episodes=args.sync_episodes, callback=callback, mean_horizon=args.mean_horizon, + save_best_agent=args.save_best_agent + ) + runner.close() + + if args.path is not None: + directory = os.path.split(args.path)[0] + if directory != '' and not os.path.isdir(directory): + os.makedirs(directory, exist_ok=True) + + with open(args.path + '.json', 'w') as filehandle: + filehandle.write( + json.dumps(dict( + rewards=rewards, timesteps=timesteps, seconds=seconds, + agent_seconds=agent_seconds + )) + ) + + if args.seaborn: + import seaborn as sns + sns.set() + + xs = np.arange(len(rewards)) + figure, axis1 = plt.subplots() + axis1.set_xlabel('episodes') + axis2 = axis1.twinx() + + min_timesteps = np.amin(timesteps, axis=1) + max_timesteps = np.amax(timesteps, axis=1) + median_timesteps = np.median(timesteps, axis=1) + axis2.plot(xs, median_timesteps, color='blue', linewidth=2.0) + axis2.fill_between(xs, min_timesteps, max_timesteps, color='blue', alpha=0.4) + axis2.set_ylabel('episode length', color='blue') + + min_rewards = np.amin(rewards, axis=1) + max_rewards = np.amax(rewards, axis=1) + median_rewards = np.median(rewards, axis=1) + axis1.plot(xs, median_rewards, color='green', linewidth=2.0) + axis1.fill_between(xs, min_rewards, max_rewards, color='green', alpha=0.4) + axis1.set_ylabel('episode return', color='green') + + figure.tight_layout() + plt.savefig(fname=(args.path + '.png')) + + +if __name__ == '__main__': + main() diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index b88034e41..000000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[metadata] -description-file = README.md diff --git a/setup.py b/setup.py index eeb7a2073..4b1e96509 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,53 +13,159 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import os +from setuptools import find_packages, setup +import sys + + +""" +cd docs; make html; cd ..; + +pip install --upgrade -r requirements-all.txt +# ... update requirements.txt and setup.py ... + +# Check "before update" notes + +# Update __version__ in tensorforce/__init__.py, and UPDATE_NOTES.md + +rm -r build +rm -r dist +rm -r docs/_* +pip install --upgrade pip setuptools wheel twine +python setup.py sdist bdist_wheel + +twine upload --repository-url https://test.pypi.org/legacy/ dist/Tensorforce-0.6.X* + +deactivate +cd .. 
+source [XYZ] + +pip install --upgrade -r tensorforce/requirements.txt +pip install --upgrade --index-url https://test.pypi.org/simple/ tensorforce +python + > import tensorforce + > print(tensorforce.__version__) + > quit() +python tensorforce/examples/quickstart.py + +deactivate +source [XYZ] +cd tensorforce + +git status +git add -u +git commit -m "Fix PyPI version 0.6.X" +git push origin master + +# Wait for a while to check Travis is not failing right away (installation, first tests) + +twine upload dist/Tensorforce-0.6.X* + +# Fix Github release +""" + +if sys.version_info.major != 3: + raise NotImplementedError("Tensorforce is only compatible with Python 3.") + +tensorforce_directory = os.path.abspath(os.path.dirname(__file__)) + +# Extract version from tensorforce/__init__.py +version = None +with open(os.path.join(tensorforce_directory, 'tensorforce', '__init__.py'), 'r') as filehandle: + for line in filehandle: + if line.startswith('__version__ = \'') and line.endswith('\'\n'): + version = line[15:-2] +assert version is not None + +# Extract long_description from README.md introduction +long_description = list() +with open(os.path.join(tensorforce_directory, 'README.md'), 'r') as filehandle: + lines = iter(filehandle) + line = next(lines) + if not line.startswith('# Tensorforce:'): + raise NotImplementedError + long_description.append(line) + for line in lines: + if line == '#### Introduction\n': + break + if next(lines) != '\n': + raise NotImplementedError + while True: + line = next(lines) + if line == '\n': + line = next(lines) + if line == '\n': + break + else: + long_description.append('\n') + long_description.append(line) + else: + long_description.append(line) + while line == '\n': + line = next(lines) + if not line.startswith('#### '): + raise NotImplementedError +assert len(long_description) > 0 +long_description.append('\n') +long_description.append('For more information, see the [GitHub project page](https://github.com/ten' + 'sorforce/tensorforce) and [ReadTheDocs documentation](https://tensorforce.' + 'readthedocs.io/en/latest/).\n') +long_description = ''.join(long_description) + +# Find packages +packages = find_packages(exclude=('test',)) +assert all(package.startswith('tensorforce') for package in packages) + +# Extract install_requires from requirements.txt +install_requires = list() +with open(os.path.join(tensorforce_directory, 'requirements.txt'), 'r') as filehandle: + for line in filehandle: + line = line.strip() + if line: + install_requires.append(line) +assert len(install_requires) > 0 + +# Readthedocs requires Sphinx extensions to be specified as part of install_requires. +if os.environ.get('READTHEDOCS', None) == 'True': + install_requires.append('recommonmark') -from setuptools import setup, find_packages - -install_requires = [ - 'numpy', - 'six', - 'scipy', - 'pillow', - 'pytest' -] - -setup_requires = [ - 'numpy', - 'recommonmark' -] - -extras_require = { - 'tf': ['tensorflow>=1.4.0'], - 'tf_gpu': ['tensorflow-gpu>=1.4.0'], - 'gym': ['gym==0.9.5'], - 'universe': ['universe>=0.21.3'], - 'mazeexp': ['mazeexp>=0.0.1'], - 'ue4': ['msgpack-python', 'msgpack-numpy'] -} - -# Readthedocs requires Sphinx extensions to be specified as part of -# install_requires in order to build properly. 
-on_rtd = os.environ.get('READTHEDOCS', None) == 'True' -if on_rtd: - install_requires.extend(setup_requires) - - -setup(name='tensorforce', - version='0.3.5.1', # please remember to edit tensorforce/__init__.py when updating the version - description='Reinforcement learning for TensorFlow', - url='http://github.com/reinforceio/tensorforce', - download_url='https://github.com/reinforceio/tensorforce/archive/0.3.5.1.tar.gz', - author='reinforce.io', - author_email='contact@reinforce.io', - license='Apache 2.0', - packages=[package for package in find_packages() if package.startswith('tensorforce')], - install_requires=install_requires, - setup_requires=setup_requires, - extras_require=extras_require, - zip_safe=False) +setup( + name='Tensorforce', + version=version, + description='Tensorforce: a TensorFlow library for applied reinforcement learning', + long_description=long_description, + long_description_content_type='text/markdown', + author='Alexander Kuhnle', + author_email='tensorforce.team@gmail.com', + url='http://github.com/tensorforce/tensorforce', + packages=packages, + download_url='https://github.com/tensorforce/tensorforce/archive/{}.tar.gz'.format(version), + license='Apache 2.0', + python_requires='>=3.7', + classifiers=[ + 'Natural Language :: English', + 'Topic :: Scientific/Engineering', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8' + ], + install_requires=install_requires, + extras_require=dict( + tfa=['tensorflow-addons >= 0.15.0'], + tune=['hpbandster >= 0.7.4'], + envs=[ + 'ale-py >= 0.7.3', 'gym[atari,box2d,classic_control] >= 0.21.0', 'box2d >= 2.3.10', + 'gym-retro >= 0.8.0', 'vizdoom == 1.1.11' + ], + ale=['ale-py >= 0.7.3'], + gym=['gym[box2d,classic_control] >= 0.21.0', 'box2d >= 2.3.10'], + retro=['gym-retro >= 0.8.0'], + vizdoom=['vizdoom >= 1.1.11'], + carla=['pygame', 'opencv-python'], + docs=[ + 'm2r >= 0.2.1', 'recommonmark >= 0.7.1', 'sphinx >= 4.3.2', 'sphinx-rtd-theme >= 1.0.0' + ], + tests=['pytest >= 6.2.5'] + ), + zip_safe=False +) diff --git a/tensorforce/__init__.py b/tensorforce/__init__.py index 2d47a6a14..6704ea0f7 100755 --- a/tensorforce/__init__.py +++ b/tensorforce/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,26 +13,20 @@ # limitations under the License. # ============================================================================== +import logging +import os -from tensorforce.exception import TensorForceError - - -__version__ = '0.3.5.1' - +if 'TF_CPP_MIN_LOG_LEVEL' not in os.environ: + os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -# Libraries should add NullHandler() by default, as its the application code's -# responsibility to configure log handlers. 
-# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library +from tensorforce.exception import TensorforceError +from tensorforce.environments import Environment +from tensorforce.agents import Agent +from tensorforce.execution import Runner -import logging -try: - from logging import NullHandler -except ImportError: - class NullHandler(logging.Handler): - def emit(self, record): - pass +__all__ = ['Agent', 'Environment', 'Runner', 'TensorforceError'] -logging.getLogger(__name__).addHandler(NullHandler()) +__version__ = '0.6.5' -__all__ = ['TensorForceError'] +logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/tensorforce/agents/__init__.py b/tensorforce/agents/__init__.py index 0af78d76e..9bf5a3972 100755 --- a/tensorforce/agents/__init__.py +++ b/tensorforce/agents/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,48 +13,61 @@ # limitations under the License. # ============================================================================== +from tensorforce.agents.recorder import Recorder + from tensorforce.agents.agent import Agent -from tensorforce.agents.constant_agent import ConstantAgent -from tensorforce.agents.random_agent import RandomAgent -from tensorforce.agents.learning_agent import LearningAgent -from tensorforce.agents.dqfd_agent import DQFDAgent -from tensorforce.agents.dqn_agent import DQNAgent -from tensorforce.agents.dqn_nstep_agent import DQNNstepAgent -from tensorforce.agents.naf_agent import NAFAgent -from tensorforce.agents.ppo_agent import PPOAgent -from tensorforce.agents.trpo_agent import TRPOAgent -from tensorforce.agents.vpg_agent import VPGAgent -from tensorforce.agents.ddpg_agent import DDPGAgent -# from tensorforce.agents.categorical_dqn_agent import CategoricalDQNAgent + +from tensorforce.agents.constant import ConstantAgent +from tensorforce.agents.random import RandomAgent +from tensorforce.agents.tensorforce import TensorforceAgent + +from tensorforce.agents.a2c import AdvantageActorCritic +from tensorforce.agents.ac import ActorCritic +from tensorforce.agents.dpg import DeterministicPolicyGradient +from tensorforce.agents.double_dqn import DoubleDQN +from tensorforce.agents.dqn import DeepQNetwork +from tensorforce.agents.dueling_dqn import DuelingDQN +from tensorforce.agents.ppo import ProximalPolicyOptimization +from tensorforce.agents.trpo import TrustRegionPolicyOptimization +from tensorforce.agents.vpg import VanillaPolicyGradient + + +A2C = A2CAgent = AdvantageActorCritic +AC = ACAgent = ActorCritic +Constant = ConstantAgent +DPG = DDPG = DPGAgent = DeterministicPolicyGradient +DDQN = DoubleDQNAgent = DoubleDQN +DQN = DQNAgent = DeepQNetwork +DuelingDQNAgent = DuelingDQN +PPO = PPOAgent = ProximalPolicyOptimization +Random = RandomAgent +Tensorforce = TensorforceAgent +TRPO = TRPOAgent = TrustRegionPolicyOptimization +VPG = REINFORCE = VPGAgent = VanillaPolicyGradient agents = dict( - constant_agent=ConstantAgent, - random_agent=RandomAgent, - dqfd_agent=DQFDAgent, - dqn_agent=DQNAgent, - dqn_nstep_agent=DQNNstepAgent, - naf_agent=NAFAgent, - ppo_agent=PPOAgent, - trpo_agent=TRPOAgent, - vpg_agent=VPGAgent, - ddpg_agent=DDPGAgent - # CategoricalDQNAgent=CategoricalDQNAgent, + a2c=AdvantageActorCritic, ac=ActorCritic, constant=ConstantAgent, + ddpg=DeterministicPolicyGradient, 
ddqn=DoubleDQN, default=TensorforceAgent, + dpg=DeterministicPolicyGradient, double_dqn=DoubleDQN, dqn=DeepQNetwork, dueling_dqn=DuelingDQN, + tensorforce=TensorforceAgent, ppo=ProximalPolicyOptimization, random=RandomAgent, + recorder=Recorder, reinforce=VanillaPolicyGradient, trpo=TrustRegionPolicyOptimization, + vpg=VanillaPolicyGradient ) __all__ = [ - 'Agent', - 'ConstantAgent', - 'RandomAgent', - 'LearningAgent', - 'DQFDAgent', - 'DQNAgent', - 'DQNNstepAgent', - 'NAFAgent', - 'PPOAgent', - 'TRPOAgent', - 'VPGAgent', - 'DDPGAgent', - 'agents' + 'Agent', 'agents', + 'A2C', 'A2CAgent', 'AdvantageActorCritic', + 'AC', 'ACAgent', 'ActorCritic', + 'Constant', 'ConstantAgent', + 'DPG', 'DDPG', 'DPGAgent', 'DeterministicPolicyGradient', + 'DDQN', 'DoubleDQNAgent', 'DoubleDQN', + 'DQN', 'DQNAgent', 'DeepQNetwork', + 'DuelingDQN', 'DuelingDQNAgent', + 'PPO', 'PPOAgent', 'ProximalPolicyOptimization', + 'Random', 'RandomAgent', + 'Tensorforce', 'TensorforceAgent', + 'TRPO', 'TRPOAgent', 'TrustRegionPolicyOptimization', + 'VPG', 'REINFORCE', 'VPGAgent', 'VanillaPolicyGradient' ] diff --git a/tensorforce/agents/a2c.py b/tensorforce/agents/a2c.py new file mode 100644 index 000000000..e8b1dce6f --- /dev/null +++ b/tensorforce/agents/a2c.py @@ -0,0 +1,238 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class AdvantageActorCritic(TensorforceAgent): + """ + [Advantage Actor-Critic](https://arxiv.org/abs/1602.01783) agent + (specification key: `a2c`). + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
+            • type ("bool" | "int" | "float") – state data type (default: "float").
+            • shape (int | iter[int]) – state shape (required).
+            • num_values (int > 0) – number of discrete state values (required for type "int").
+            • min_value/max_value (float) – minimum/maximum state value (optional for type "float").
+        actions (specification): Actions specification (required, better implicitly specified via `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of action descriptions (usually taken from `Environment.actions()`) with the following attributes:
+            • type ("bool" | "int" | "float") – action data type (required).
+            • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
+            • num_values (int > 0) – number of discrete action values (required for type "int").
+            • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
          + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + batch_size (parameter, int > 0): Number of + timesteps per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous + actions by default. + (default: false). + + memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + maximum + network/estimator horizon + 1 timesteps + (default: minimum capacity, usually does not + need to be changed). + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: batch_size). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-3). + + horizon ("episode" | parameter, int >= 0): Horizon + of discounted-sum reward estimation before critic estimate + (default: 1). + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + advantage_processing (specification): Advantage processing as layer or list of layers, see + the [preprocessing documentation](../modules/preprocessing.html) + (default: no advantage processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + critic (specification): Critic network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto"). + critic_optimizer (float > 0.0 | specification): Critic optimizer configuration, see the + [optimizers documentation](../modules/optimizers.html), a float instead specifies a + custom weight for the critic loss + (default: 1.0). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). + exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). 
+        variable_noise (parameter, float >= 0.0): Add Gaussian noise with given standard deviation to all trainable variables, as alternative exploration mechanism (default: no variable noise).
          + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Required + self, states, actions, batch_size, + # Environment + max_episode_timesteps=None, + # Network + network='auto', use_beta_distribution=False, + # Memory + memory='minimum', + # Optimization + update_frequency=1.0, learning_rate=1e-3, + # Reward estimation + horizon=1, discount=0.99, reward_processing=None, return_processing=None, + advantage_processing=None, + predict_terminal_values=False, + # Critic + critic='auto', critic_optimizer=1.0, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='A2C', argument='estimate_terminal', replacement='predict_terminal_values' + ) + if 'critic_network' in kwargs: + raise TensorforceError.deprecated( + name='A2C', argument='critic_network', replacement='critic' + ) + + self.spec = OrderedDict( + agent='a2c', + states=states, actions=actions, batch_size=batch_size, + max_episode_timesteps=max_episode_timesteps, + network=network, use_beta_distribution=use_beta_distribution, + memory=memory, + update_frequency=update_frequency, learning_rate=learning_rate, + horizon=horizon, discount=discount, return_processing=return_processing, + advantage_processing=advantage_processing, + predict_terminal_values=predict_terminal_values, + critic=critic, critic_optimizer=critic_optimizer, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict( + type='parametrized_distributions', network=network, temperature=1.0, + use_beta_distribution=use_beta_distribution + ) + + if memory == 'minimum': + memory = dict(type='recent') + else: + memory = dict(type='recent', capacity=memory) + + update = dict(unit='timesteps', batch_size=batch_size, frequency=update_frequency) + + optimizer = dict(type='adam', learning_rate=learning_rate) + objective = 'policy_gradient' + + reward_estimation = dict( + horizon=horizon, discount=discount, predict_horizon_values='early', + estimate_advantage=True, predict_action_values=False, + reward_processing=reward_processing, return_processing=return_processing, + predict_terminal_values=predict_terminal_values + ) + + baseline = dict(type='parametrized_state_value', network=critic) + baseline_objective = dict(type='state_value') + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=critic_optimizer, 
+ baseline_objective=baseline_objective, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) diff --git a/tensorforce/agents/ac.py b/tensorforce/agents/ac.py new file mode 100644 index 000000000..2df57a259 --- /dev/null +++ b/tensorforce/agents/ac.py @@ -0,0 +1,232 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class ActorCritic(TensorforceAgent): + """ + Actor-Critic agent (specification key: `ac`). + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
+            • type ("bool" | "int" | "float") – state data type (default: "float").
+            • shape (int | iter[int]) – state shape (required).
+            • num_values (int > 0) – number of discrete state values (required for type "int").
+            • min_value/max_value (float) – minimum/maximum state value (optional for type "float").
+        actions (specification): Actions specification (required, better implicitly specified via `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of action descriptions (usually taken from `Environment.actions()`) with the following attributes:
+            • type ("bool" | "int" | "float") – action data type (required).
+            • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
+            • num_values (int > 0) – number of discrete action values (required for type "int").
+            • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
          + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + batch_size (parameter, int > 0): Number of + timesteps per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous + actions by default. + (default: false). + + memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + maximum + network/estimator horizon + 1 timesteps + (default: minimum capacity, usually does not + need to be changed). + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: batch_size). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-3). + + horizon (parameter, int >= 1): Horizon of + discounted-sum reward estimation before critic estimate + (default: 1). + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + critic (specification): Critic network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto"). + critic_optimizer (float > 0.0 | specification): Critic optimizer configuration, see the + [optimizers documentation](../modules/optimizers.html), a float instead specifies a + custom weight for the critic loss + (default: 1.0). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). + exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise).

          + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Required + self, states, actions, batch_size, + # Environment + max_episode_timesteps=None, + # Network + network='auto', use_beta_distribution=False, + # Memory + memory='minimum', + # Optimization + update_frequency=1.0, learning_rate=1e-3, + # Reward estimation + horizon=1, discount=0.99, reward_processing=None, return_processing=None, + predict_terminal_values=False, + # Critic + critic='auto', critic_optimizer=1.0, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='AC', argument='estimate_terminal', replacement='predict_terminal_values' + ) + if 'critic_network' in kwargs: + raise TensorforceError.deprecated( + name='AC', argument='critic_network', replacement='critic' + ) + + self.spec = OrderedDict( + agent='ac', + states=states, actions=actions, batch_size=batch_size, + max_episode_timesteps=max_episode_timesteps, + network=network, use_beta_distribution=use_beta_distribution, + memory=memory, + update_frequency=update_frequency, learning_rate=learning_rate, + horizon=horizon, discount=discount, return_processing=return_processing, + predict_terminal_values=predict_terminal_values, + critic=critic, critic_optimizer=critic_optimizer, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict( + type='parametrized_distributions', network=network, temperature=1.0, + use_beta_distribution=use_beta_distribution + ) + + if memory == 'minimum': + memory = dict(type='recent') + else: + memory = dict(type='recent', capacity=memory) + + update = dict(unit='timesteps', batch_size=batch_size, frequency=update_frequency) + + optimizer = dict(type='adam', learning_rate=learning_rate) + objective = 'policy_gradient' + + reward_estimation = dict( + horizon=horizon, discount=discount, predict_horizon_values='early', + estimate_advantage=False, predict_action_values=False, + reward_processing=reward_processing, return_processing=return_processing, + predict_terminal_values=predict_terminal_values + ) + + baseline = dict(type='parametrized_state_value', network=critic) + baseline_objective = dict(type='state_value') + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=critic_optimizer, + baseline_objective=baseline_objective, + 
l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) diff --git a/tensorforce/agents/agent.py b/tensorforce/agents/agent.py index b91c3f7f0..6480b2575 100755 --- a/tensorforce/agents/agent.py +++ b/tensorforce/agents/agent.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,282 +13,735 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from copy import deepcopy +from collections import OrderedDict +import json +import logging +import os +import random import numpy as np +import tensorflow as tf -from tensorforce import util, TensorForceError +from tensorforce import util, TensorforceError +from tensorforce.agents import Recorder import tensorforce.agents +from tensorforce.core import ArrayDict, TensorSpec, TensorforceConfig -class Agent(object): +class Agent(Recorder): """ - Base class for TensorForce agents. + Tensorforce agent interface. """ - def __init__( - self, - states, - actions, - batched_observe=True, - batching_capacity=1000 - ): + @staticmethod + def create(agent='tensorforce', environment=None, **kwargs): """ - Initializes the agent. + Create an agent from a specification. Args: - states (spec, or dict of specs): States specification, with the following attributes - (required): - - type: one of 'bool', 'int', 'float' (default: 'float'). - - shape: integer, or list/tuple of integers (required). - actions (spec, or dict of specs): Actions specification, with the following attributes - (required): - - type: one of 'bool', 'int', 'float' (required). - - shape: integer, or list/tuple of integers (default: []). - - num_actions: integer (required if type == 'int'). - - min_value and max_value: float (optional if type == 'float', default: none). - batched_observe (bool): Specifies whether calls to model.observe() are batched, for - improved performance (default: true). - batching_capacity (int): Batching capacity of agent and model (default: 1000). + agent (specification | Agent class/object | callable[states -> actions]): JSON file, + specification key, configuration dictionary, library module, or `Agent` + class/object. Alternatively, an act-function mapping states to actions which is + supposed to be recorded. + (default: Tensorforce base agent). + environment (Environment object): Environment which the agent is supposed to be trained + on, environment-related arguments like state/action space specifications and + maximum episode length will be extract if given + (recommended). + kwargs: Additional agent arguments. 
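For illustration, a sketch of the interchangeable specification forms described above (assumes the Gym extra is installed and the bundled benchmark configuration is available; the chosen agent and values are examples, not prescribed by this patch):

from tensorforce import Agent, Environment

environment = Environment.create(environment='gym', level='CartPole-v1')

# Keyword specification plus additional agent arguments
agent = Agent.create(agent='ppo', environment=environment, batch_size=10)

# Equivalent configuration dictionary
agent = Agent.create(agent=dict(agent='ppo', batch_size=10), environment=environment)

# JSON file specification, e.g. the benchmark configuration used elsewhere in this patch
# (each call returns an initialized agent; only one form would be used in practice)
agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)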
""" + if isinstance(agent, Recorder): + if environment is not None: + # TODO: + # assert agent.spec['states'] == environment.states() + # assert agent.spec['actions'] == environment.actions() + # assert environment.max_episode_timesteps() is None or \ + # agent.spec['max_episode_timesteps'] >= environment.max_episode_timesteps() + pass + + for key, value in kwargs.items(): + if key == 'parallel_interactions': + assert agent.spec[key] >= value + else: + assert agent.spec[key] == value + + if agent.is_initialized: + agent.reset() + else: + agent.initialize() + + return agent + + elif (isinstance(agent, type) and issubclass(agent, Agent)) or callable(agent): + # Type specification, or Recorder + if environment is not None: + if 'states' in kwargs: + # TODO: + # assert kwargs['states'] == environment.states() + pass + else: + kwargs['states'] = environment.states() + if 'actions' in kwargs: + # assert kwargs['actions'] == environment.actions() + pass + else: + kwargs['actions'] = environment.actions() + if environment.max_episode_timesteps() is None: + pass + elif 'max_episode_timesteps' in kwargs: + # assert kwargs['max_episode_timesteps'] >= environment.max_episode_timesteps() + pass + else: + kwargs['max_episode_timesteps'] = environment.max_episode_timesteps() + + if isinstance(agent, type) and issubclass(agent, Agent): + agent = agent(**kwargs) + assert isinstance(agent, Agent) + else: + if 'recorder' not in kwargs: + raise TensorforceError.required(name='Recorder', argument='recorder') + agent = Recorder(fn_act=agent, **kwargs) + return Agent.create(agent=agent, environment=environment) + + elif isinstance(agent, dict): + # Dictionary specification + agent.update(kwargs) + kwargs = dict(agent) + agent = kwargs.pop('agent', kwargs.pop('type', 'default')) + + return Agent.create(agent=agent, environment=environment, **kwargs) + + elif isinstance(agent, str): + if os.path.isfile(agent): + # JSON file specification + with open(agent, 'r') as fp: + agent = json.load(fp=fp) + return Agent.create(agent=agent, environment=environment, **kwargs) + + elif agent in tensorforce.agents.agents: + # Keyword specification + agent = tensorforce.agents.agents[agent] + return Agent.create(agent=agent, environment=environment, **kwargs) - self.set_normalized_states(states=states) - self.set_normalized_actions(actions=actions) - - # Batched observe for better performance with Python. - self.batched_observe = batched_observe - self.batching_capacity = batching_capacity - if self.batched_observe: - assert self.batching_capacity is not None - self.observe_terminal = list() - self.observe_reward = list() - - self.current_states = None - self.current_actions = None - self.current_internals = None - self.next_internals = None - self.current_terminal = None - self.current_reward = None - self.timestep = None - self.episode = None - - self.model = self.initialize_model() - self.reset() + else: + # Library specification + _agent = util.try_import_module(module=agent, parent_class=Agent) + if _agent is not None: + return Agent.create(agent=_agent, environment=environment, **kwargs) + + raise TensorforceError.value(name='Agent.create', argument='agent', value=agent) + + else: + raise TensorforceError.type(name='Agent.create', argument='agent', dtype=type(agent)) + + @staticmethod + def load(directory=None, filename=None, format=None, environment=None, **kwargs): + """ + Restores an agent from a directory/file. + + Args: + directory (str): Checkpoint directory + (required, unless saver is specified). 
+ filename (str): Checkpoint filename, with or without append and extension + (default: "agent"). + format ("checkpoint" | "numpy" | "hdf5"): File format + (default: format matching directory and + filename, required to be unambiguous). + environment (Environment object): Environment which the agent is supposed to be trained + on, environment-related arguments like state/action space specifications and + maximum episode length will be extract if given + (recommended). + kwargs: Additional agent arguments. + """ + if directory is not None: + if filename is None: + filename = 'agent' + agent = os.path.join(directory, os.path.splitext(filename)[0] + '.json') + if not os.path.isfile(agent) and agent[agent.rfind('-') + 1: -5].isdigit(): + agent = agent[:agent.rindex('-')] + '.json' + if os.path.isfile(agent): + with open(agent, 'r') as fp: + agent = json.load(fp=fp) + if 'agent' in kwargs: + if 'agent' in agent and agent['agent'] != kwargs['agent']: + raise TensorforceError.value( + name='Agent.load', argument='agent', value=kwargs['agent'] + ) + agent['agent'] = kwargs.pop('agent') + else: + agent = kwargs + kwargs = dict() + else: + agent = kwargs + kwargs = dict() + + # Overwrite values + if agent.get('max_episode_timesteps') is None: + if environment is not None and environment.max_episode_timesteps() is not None: + if 'max_episode_timesteps' in kwargs: + assert kwargs['max_episode_timesteps'] >= environment.max_episode_timesteps() + agent['max_episode_timesteps'] = kwargs['max_episode_timesteps'] + else: + agent['max_episode_timesteps'] = environment.max_episode_timesteps() + if 'parallel_interactions' in kwargs and kwargs['parallel_interactions'] > 1: + agent['parallel_interactions'] = kwargs['parallel_interactions'] + + agent.pop('internals', None) + agent.pop('initial_internals', None) + saver_restore = False + if 'saver' in agent and isinstance(agent['saver'], dict): + if not agent.get('load', True): + raise TensorforceError.value( + name='Agent.load', argument='saver[load]', value=agent['saver']['load'] + ) + agent['saver'] = dict(agent['saver']) + agent['saver']['load'] = True + saver_restore = True + elif 'saver' in kwargs and isinstance(kwargs['saver'], dict): + if not kwargs.get('load', True): + raise TensorforceError.value( + name='Agent.load', argument='saver[load]', value=kwargs['saver']['load'] + ) + kwargs['saver'] = dict(kwargs['saver']) + kwargs['saver']['load'] = True + saver_restore = True + agent = Agent.create(agent=agent, environment=environment, **kwargs) + if not saver_restore: + agent.restore(directory=directory, filename=filename, format=format) + + return agent + + def __init__( + self, states, actions, max_episode_timesteps=None, parallel_interactions=1, config=None, + recorder=None + ): + util.overwrite_staticmethod(obj=self, function='create') + util.overwrite_staticmethod(obj=self, function='load') + + # Check whether spec attribute exists + if not hasattr(self, 'spec'): + raise TensorforceError.required_attribute(name='Agent', attribute='spec') + + # Tensorforce config + if config is None: + config = dict() + self.config = TensorforceConfig(**config) + + # TensorFlow logging + tf.get_logger().setLevel(self.config.tf_log_level) + + # TensorFlow eager mode + if self.config.eager_mode: + tf.config.run_functions_eagerly(run_eagerly=True) + + # Random seed + if self.config.seed is not None: + random.seed(a=self.config.seed) + np.random.seed(seed=self.config.seed) + tf.random.set_seed(seed=self.config.seed) + + super().__init__( + fn_act=None, states=states, 
actions=actions, + max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, recorder=recorder + ) def __str__(self): - return str(self.__class__.__name__) + return self.__class__.__name__ + + def get_specification(self): + """ + Returns the agent specification. + + Returns: + dict: Agent specification. + """ + return dict(self.spec) + + def get_architecture(self): + """ + Returns a string representation of the network layer architecture (policy, baseline, + state-preprocessing). + + Returns: + str: String representation of network architecture. + """ + return self.model.get_architecture() + + def initialize(self): + """ + Initializes the agent. Automatically triggered as part of Agent.create/load. + """ + super().initialize() + + # Initialize model + if not hasattr(self, 'model'): + raise TensorforceError.required_attribute(name='Agent', attribute='model') + self.model.initialize() + + # Value space specifications + assert self.states_spec == self.model.states_spec + self.internals_spec = self.model.internals_spec + self.auxiliaries_spec = self.model.auxiliaries_spec + assert self.actions_spec == self.model.actions_spec + assert self.terminal_spec == self.model.terminal_spec + assert self.reward_spec == self.model.reward_spec + assert self.parallel_spec == self.model.parallel_spec + self.deterministic_spec = self.model.deterministic_spec + + # Parallel observe buffers + self.terminal_buffer = [list() for _ in range(self.parallel_interactions)] + self.reward_buffer = [list() for _ in range(self.parallel_interactions)] + + # Store agent spec as JSON + if self.model.saver is not None: + path = os.path.join(self.model.saver_directory, self.model.saver_filename + '.json') + try: + with open(path, 'w') as fp: + spec = OrderedDict(self.spec) + spec['internals'] = self.internals_spec + spec['initial_internals'] = self.initial_internals() + json.dump(obj=spec, fp=fp, cls=TensorforceJSONEncoder) + except BaseException: + try: + with open(path, 'w') as fp: + spec = OrderedDict() + spec['states'] = self.spec['states'] + spec['actions'] = self.spec['actions'] + spec['internals'] = self.internals_spec + spec['initial_internals'] = self.initial_internals() + json.dump(obj=spec, fp=fp, cls=TensorforceJSONEncoder) + except BaseException: + os.remove(path) + raise + + # Reset model + timesteps, episodes, updates = self.model.reset() + self.timesteps = timesteps.numpy().item() + self.episodes = episodes.numpy().item() + self.updates = updates.numpy().item() def close(self): + """ + Closes the agent. + """ + super().close() self.model.close() + del self.model - def set_normalized_states(self, states): - # Leave incoming states dict intact. - self.states = deepcopy(states) - - # Unique state shortform. - self.unique_state = ('shape' in self.states) - if self.unique_state: - self.states = dict(state=self.states) - - # Normalize states. - for name, state in self.states.items(): - # Convert int to unary tuple. - if isinstance(state['shape'], int): - state['shape'] = (state['shape'],) - - # Set default type to float. - if 'type' not in state: - state['type'] = 'float' - - def set_normalized_actions(self, actions): - # Leave incoming spec-dict intact. - self.actions = deepcopy(actions) - - # Unique action shortform. - self.unique_action = ('type' in self.actions) - if self.unique_action: - self.actions = dict(action=self.actions) - - # Normalize actions. 
- for name, action in self.actions.items(): - # Set default type to int - if 'type' not in action: - action['type'] = 'int' - - # Check required values - if action['type'] == 'int': - if 'num_actions' not in action: - raise TensorForceError("Action requires value 'num_actions' set!") - elif action['type'] == 'float': - if ('min_value' in action) != ('max_value' in action): - raise TensorForceError("Action requires both values 'min_value' and 'max_value' set!") - - # Set default shape to empty tuple (single-int, discrete action space) - if 'shape' not in action: - action['shape'] = () - - # Convert int to unary tuple - if isinstance(action['shape'], int): - action['shape'] = (action['shape'],) - - def initialize_model(self): + def reset(self): """ - Creates the model for the respective agent based on specifications given by user. This is a separate - call after constructing the agent because the agent constructor has to perform a number of checks - on the specs first, sometimes adjusting them e.g. by converting to a dict. + Resets possibly inconsistent internal values, for instance, after saving and restoring an + agent. Automatically triggered as part of Agent.create/load/initialize/restore. """ - raise NotImplementedError + super().reset() - def reset(self): + # Reset observe buffers + for buffer in self.terminal_buffer: + buffer.clear() + for buffer in self.reward_buffer: + buffer.clear() + + # Reset model + timesteps, episodes, updates = self.model.reset() + self.timesteps = timesteps.numpy().item() + self.episodes = episodes.numpy().item() + self.updates = updates.numpy().item() + + if self.model.saver is not None: + self.model.save() + + def initial_internals(self): + """ + Returns the initial internal agent state(s), to be used at the beginning of an episode as + `internals` argument for `act()` in independent mode + + Returns: + dict[internal]: Dictionary containing initial internal agent state(s). + """ + return self.model.initial_internals.to_dict() + + def tracked_tensors(self): """ - Reset the agent to its initial state (e.g. on experiment start). Updates the Model's internal episode and - time step counter, internal states, and resets preprocessors. + Returns the current value of all tracked tensors (as specified by "tracking" agent + argument). Note that not all tensors change at every timestep. + + Returns: + dict[values]: Dictionary containing the current value of all tracked tensors. """ - self.episode, self.timestep, self.next_internals = self.model.reset() - self.current_internals = self.next_internals + return self.model.tracked_tensors() - def act(self, states, deterministic=False, independent=False, fetch_tensors=None): + def act( + self, states, internals=None, parallel=0, independent=False, deterministic=True, + # Deprecated + evaluation=None + ): """ - Return action(s) for given state(s). States preprocessing and exploration are applied if - configured accordingly. + Returns action(s) for the given state(s), needs to be followed by `observe()` unless + independent mode. + + See the [act-observe script](https://github.com/tensorforce/tensorforce/blob/master/examples/act_observe_interface.py) + for an example application as part of the act-observe interface. Args: - states (any): One state (usually a value tuple) or dict of states if multiple states are expected. - deterministic (bool): If true, no exploration and sampling is applied. - independent (bool): If true, action is not followed by observe (and hence not included - in updates). 
- fetch_tensors (list): Optional String of named tensors to fetch + states (dict[state] | iter[dict[state]]): Dictionary containing state(s) to be acted on + (required). + internals (dict[internal] | iter[dict[internal]]): Dictionary containing current + internal agent state(s), either given by `initial_internals()` at the beginning of + an episode or as return value of the preceding `act()` call + (required if independent mode and agent + has internal states). + parallel (int | iter[int]): Parallel execution index + (default: 0). + independent (bool): Whether this act() call is not part of the training + agent-environment interaction and thus not followed by observe(), meaning its + inputs/outputs/internals are not stored in memory and not used in updates, e.g. for + independent evaluation episodes which should not be learned from + (default: false). + deterministic (bool): Whether action should be chosen deterministically, so no + action distribution sampling and no exploration, only valid in independent mode + (default: true). + Returns: - Scalar value of the action or dict of multiple actions the agent wants to execute. - (fetched_tensors) Optional dict() with named tensors fetched + dict[action] | iter[dict[action]], dict[internal] | iter[dict[internal]] if `internals` + argument given: Dictionary containing action(s), dictionary containing next internal + agent state(s) if independent mode. """ - self.current_internals = self.next_internals + if evaluation is not None: + raise TensorforceError.deprecated( + name='Agent.act', argument='evaluation', replacement='independent' + ) - if self.unique_state: - self.current_states = dict(state=np.asarray(states)) + return super().act( + states=states, internals=internals, parallel=parallel, independent=independent, + deterministic=deterministic + ) + + def fn_act( + self, states, internals, parallel, independent, deterministic, is_internals_none, + num_parallel + ): + + # Separate auxiliaries + def function(name, spec): + auxiliary = ArrayDict() + if self.config.enable_int_action_masking and spec.type == 'int' and \ + spec.num_values is not None: + if name is None: + name = 'action' + # Mask, either part of states or default all true + auxiliary['mask'] = states.pop(name + '_mask', np.ones( + shape=(num_parallel,) + spec.shape + (spec.num_values,), dtype=spec.np_type() + )) + return auxiliary + + auxiliaries = self.actions_spec.fmap(function=function, cls=ArrayDict, with_names=True) + if self.states_spec.is_singleton() and not states.is_singleton(): + states[None] = states.pop('state') + + # Inputs to tensors + states = self.states_spec.to_tensor(value=states, batched=True, name='Agent.act states') + if independent and not is_internals_none: + internals = self.internals_spec.to_tensor( + value=internals, batched=True, recover_empty=True, name='Agent.act internals' + ) + auxiliaries = self.auxiliaries_spec.to_tensor( + value=auxiliaries, batched=True, name='Agent.act auxiliaries' + ) + if independent: + deterministic = self.deterministic_spec.to_tensor( + value=deterministic, batched=False, name='Agent.act deterministic' + ) else: - self.current_states = {name: np.asarray(state) for name, state in states.items()} - - if fetch_tensors is not None: - # Retrieve action - self.current_actions, self.next_internals, self.timestep, self.fetched_tensors = self.model.act( - states=self.current_states, - internals=self.current_internals, - deterministic=deterministic, - independent=independent, - fetch_tensors=fetch_tensors + parallel = 
self.parallel_spec.to_tensor( + value=parallel, batched=True, name='Agent.act parallel' + ) + + # Model.act() + if not independent: + actions, timesteps = self.model.act( + states=states, auxiliaries=auxiliaries, parallel=parallel ) + self.timesteps = timesteps.numpy().item() - if self.unique_action: - return self.current_actions['action'], self.fetched_tensors + elif len(self.internals_spec) > 0: + if len(self.auxiliaries_spec) > 0: + actions, internals = self.model.independent_act( + states=states, internals=internals, auxiliaries=auxiliaries, + deterministic=deterministic + ) else: - return self.current_actions, self.fetched_tensors + assert len(auxiliaries) == 0 + actions, internals = self.model.independent_act( + states=states, internals=internals, deterministic=deterministic + ) else: - # Retrieve action - self.current_actions, self.next_internals, self.timestep = self.model.act( - states=self.current_states, - internals=self.current_internals, - deterministic=deterministic, - independent=independent + if len(self.auxiliaries_spec) > 0: + actions = self.model.independent_act( + states=states, auxiliaries=auxiliaries, deterministic=deterministic + ) + else: + assert len(auxiliaries) == 0 + actions = self.model.independent_act(states=states, deterministic=deterministic) + + # Outputs from tensors + actions = self.actions_spec.from_tensor( + tensor=actions, batched=True, name='Agent.act output actions' + ) + if independent and len(self.internals_spec) > 0: + internals = self.internals_spec.from_tensor( + tensor=internals, batched=True, name='Agent.act output internals' ) - if self.unique_action: - return self.current_actions['action'] - else: - return self.current_actions + if self.model.saver is not None: + self.model.save() + + return actions, internals - def observe(self, terminal, reward): + def observe(self, reward=0.0, terminal=False, parallel=0): """ - Observe experience from the environment to learn from. Optionally pre-processes rewards - Child classes should call super to get the processed reward - EX: terminal, reward = super()... + Observes reward and whether a terminal state is reached, needs to be preceded by `act()`. + + See the [act-observe script](https://github.com/tensorforce/tensorforce/blob/master/examples/act_observe_interface.py) + for an example application as part of the act-observe interface. Args: - terminal (bool): boolean indicating if the episode terminated after the observation. - reward (float): scalar reward that resulted from executing the action. + reward (float | iter[float]): Reward + (default: 0.0). + terminal (bool | 0 | 1 | 2 | iter[...]): Whether a terminal state is reached, or 2 if + the episode was aborted + (default: false). + parallel (int, iter[int]): Parallel execution index + (default: 0). + + Returns: + int: Number of performed updates. """ - self.current_terminal = terminal - self.current_reward = reward - - if self.batched_observe: - # Batched observe for better performance with Python. 
- self.observe_terminal.append(self.current_terminal) - self.observe_reward.append(self.current_reward) - - if self.current_terminal or len(self.observe_terminal) >= self.batching_capacity: - self.episode = self.model.observe( - terminal=self.observe_terminal, - reward=self.observe_reward - ) - self.observe_terminal = list() - self.observe_reward = list() + reward, terminal, parallel = super().observe( + reward=reward, terminal=terminal, parallel=parallel + ) - else: - self.episode = self.model.observe( - terminal=self.current_terminal, - reward=self.current_reward + # Process per parallel interaction + num_updates = 0 + for p, t, r in zip(parallel.tolist(), terminal.tolist(), reward.tolist()): + + # Buffer inputs + self.terminal_buffer[p].append(t) + self.reward_buffer[p].append(r) + + # Continue if not terminal and buffer_observe + if t == 0 and ( + self.config.buffer_observe == 'episode' or + len(self.terminal_buffer[p]) < self.config.buffer_observe + ): + continue + + # Buffered terminal/reward inputs + ts = np.asarray(self.terminal_buffer[p], dtype=self.terminal_spec.np_type()) + rs = np.asarray(self.reward_buffer[p], dtype=self.reward_spec.np_type()) + self.terminal_buffer[p].clear() + self.reward_buffer[p].clear() + + # Inputs to tensors + terminal_tensor = self.terminal_spec.to_tensor( + value=ts, batched=True, name='Agent.observe terminal' + ) + reward_tensor = self.reward_spec.to_tensor( + value=rs, batched=True, name='Agent.observe reward' + ) + parallel_tensor = self.parallel_spec.to_tensor( + value=p, batched=False, name='Agent.observe parallel' ) - def should_stop(self): - return self.model.monitored_session.should_stop() + # Model.observe() + updated, episodes, updates = self.model.observe( + terminal=terminal_tensor, reward=reward_tensor, parallel=parallel_tensor + ) + num_updates += int(updated.numpy().item()) + self.episodes = episodes.numpy().item() + self.updates = updates.numpy().item() - def last_observation(self): - return dict( - states=self.current_states, - internals=self.current_internals, - actions=self.current_actions, - terminal=self.current_terminal, - reward=self.current_reward - ) + if self.model.saver is not None: + self.model.save() + + return num_updates - def save_model(self, directory=None, append_timestep=True): + def save(self, directory, filename=None, format='checkpoint', append=None): """ - Save TensorFlow model. If no checkpoint directory is given, the model's default saver - directory is used. Optionally appends current timestep to prevent overwriting previous - checkpoint files. Turn off to be able to load model from the same given path argument as - given here. + Saves the agent to a checkpoint. Args: - directory (str): Optional checkpoint directory. - append_timestep (bool): Appends the current timestep to the checkpoint file if true. - If this is set to True, the load path must include the checkpoint timestep suffix. - For example, if stored to models/ and set to true, the exported file will be of the - form models/model.ckpt-X where X is the last timestep saved. The load path must - precisely match this file name. If this option is turned off, the checkpoint will - always overwrite the file specified in path and the model can always be loaded under - this path. + directory (str): Checkpoint directory + (required). + filename (str): Checkpoint filename, without extension + (default: agent name). 
+ format ("checkpoint" | "saved-model" | "numpy" | "hdf5"): File format, "checkpoint" + uses the [TensorFlow Checkpoint](https://www.tensorflow.org/guide/checkpoint) to + save the model, "saved-model" uses the + [TensorFlow SavedModel](https://www.tensorflow.org/guide/saved_model) to save an + optimized act-only model (use only if you really need TF's SavedModel format, + loading not supported), whereas the others store only variables as NumPy/HDF5 file + (default: TensorFlow Checkpoint). + append ("timesteps" | "episodes" | "updates"): Append timestep/episode/update to + checkpoint filename + (default: none). Returns: - Checkpoint path were the model was saved. + str: Checkpoint path. """ - return self.model.save(directory=directory, append_timestep=append_timestep) - - def restore_model(self, directory=None, file=None): + # TODO: Messes with required parallel disentangling, better to remove unfinished episodes + # from memory, but currently entire episode buffered anyway... + # Empty buffers before saving + # for parallel in range(self.parallel_interactions): + # if self.buffer_indices[parallel] > 0: + # self.model_observe(parallel=parallel) + os.makedirs(directory, exist_ok=True) + path = self.model.save(directory=directory, filename=filename, format=format, append=append) + + if filename is None: + filename = self.model.name + spec_path = os.path.join(directory, filename + '.json') + try: + with open(spec_path, 'w') as fp: + spec = OrderedDict(self.spec) + spec['internals'] = self.internals_spec + spec['initial_internals'] = self.initial_internals() + json.dump(obj=spec, fp=fp, cls=TensorforceJSONEncoder) + except BaseException: + logging.warning("Some agent argument could not be encoded to JSON as part of " + "Agent.save(), likely a Python object or class. Either specify as " + "module string or dictionary, or pass the full set of agent arguments " + "when loading via Agent.load().") + try: + with open(spec_path, 'w') as fp: + spec = OrderedDict() + spec['states'] = self.spec['states'] + spec['actions'] = self.spec['actions'] + spec['internals'] = self.internals_spec + spec['initial_internals'] = self.initial_internals() + json.dump(obj=spec, fp=fp, cls=TensorforceJSONEncoder) + except BaseException: + os.remove(spec_path) + + return path + + def restore(self, directory=None, filename=None, format=None): """ - Restore TensorFlow model. If no checkpoint file is given, the latest checkpoint is - restored. If no checkpoint directory is given, the model's default saver directory is - used (unless file specifies the entire path). + Restores the agent from a checkpoint. Args: - directory: Optional checkpoint directory. - file: Optional checkpoint file, or path if directory not given. + directory (str): Checkpoint directory + (required). + filename (str): Checkpoint filename, with or without append and extension + (default: agent name). + format ("checkpoint" | "numpy" | "hdf5"): File format + (default: format matching directory and + filename, required to be unambiguous). 
""" - self.model.restore(directory=directory, file=file) + if not hasattr(self, 'model'): + raise TensorforceError(message="Missing agent attribute model.") + + if not self.is_initialized: + self.initialize() + + if filename is None: + filename = self.model.name + + # format implicitly given if file exists + if format is None and os.path.isfile(os.path.join(directory, filename)): + if '.data-' in filename: + filename = filename[:filename.index('.data-')] + format = 'checkpoint' + elif filename.endswith('.npz'): + filename = filename[:-4] + format = 'numpy' + elif filename.endswith('.hdf5'): + filename = filename[:-5] + format = 'hdf5' + elif filename.endswith('.h5'): + filename = filename[:-3] + format = 'hdf5' + else: + assert False + elif format is None and os.path.isfile(os.path.join(directory, filename + '.index')): + format = 'checkpoint' + elif format is None and os.path.isfile(os.path.join(directory, filename + '.npz')): + format = 'numpy' + elif format is None and ( + os.path.isfile(os.path.join(directory, filename + '.hdf5')) or + os.path.isfile(os.path.join(directory, filename + '.h5')) + ): + format = 'hdf5' - @staticmethod - def from_spec(spec, kwargs): - """ - Creates an agent from a specification dict. - """ - agent = util.get_object( - obj=spec, - predefined_objects=tensorforce.agents.agents, - kwargs=kwargs + else: + # infer format from directory + found = None + latest = -1 + for name in os.listdir(directory): + if format in (None, 'numpy') and name == filename + '.npz': + assert found is None + found = 'numpy' + latest = None + elif format in (None, 'numpy') and name.startswith(filename) and \ + name.endswith('.npz'): + assert found is None or found == 'numpy' + found = 'numpy' + n = int(name[len(filename) + 1: -4]) + if n > latest: + latest = n + elif format in (None, 'hdf5') and \ + (name == filename + '.hdf5' or name == filename + '.h5'): + assert found is None + found = 'hdf5' + latest = None + elif format in (None, 'hdf5') and name.startswith(filename) and \ + (name.endswith('.hdf5') or name.endswith('.h5')): + assert found is None or found == 'hdf5' + found = 'hdf5' + n = int(name[len(filename) + 1: -5]) + if n > latest: + latest = n + + if latest == -1: + if format is None: + format = 'checkpoint' + else: + assert format == 'checkpoint' + if not os.path.isfile(os.path.join(directory, filename + '.index')): + import tensorflow as tf + path = tf.train.latest_checkpoint(checkpoint_dir=directory) + if not path: + raise TensorforceError.exists_not(name='Checkpoint', value=directory) + _directory, filename = os.path.split(path) + assert _directory == directory + + else: + if format is None: + format = found + else: + assert format == found + if latest is not None: + filename = filename + '-' + str(latest) + + self.timesteps, self.episodes, self.updates = self.model.restore( + directory=directory, filename=filename, format=format ) - assert isinstance(agent, Agent) - return agent + + +class TensorforceJSONEncoder(json.JSONEncoder): + """ + Custom JSON encoder which is NumPy-compatible. 
+ """ + + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, TensorSpec): + return obj.json() + else: + return super().default(obj) diff --git a/tensorforce/agents/constant.py b/tensorforce/agents/constant.py new file mode 100644 index 000000000..01ea5a08e --- /dev/null +++ b/tensorforce/agents/constant.py @@ -0,0 +1,134 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce.agents import Agent +from tensorforce.core.models import ConstantModel + + +class ConstantAgent(Agent): + """ + Agent returning constant action values (specification key: `constant`). + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
            • type ("bool" | "int" | "float") – state data type (default: "float").
            • shape (int | iter[int]) – state shape (required).
            • num_values (int > 0) – number of discrete state values (required for type "int").
            • min_value/max_value (float) – minimum/maximum state value (optional for type "float").

        actions (specification): Actions specification (required, better implicitly specified via
            `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of
            action descriptions (usually taken from `Environment.actions()`) with the following
            attributes:
            • type ("bool" | "int" | "float") – action data type (required).
            • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
            • num_values (int > 0) – number of discrete action values (required for type "int").
            • min_value/max_value (float) – minimum/maximum action value (optional for type "float").

        max_episode_timesteps (int > 0): Upper bound for number of timesteps per episode
            (default: not given, better implicitly specified via `environment` argument for
            `Agent.create(...)`).

        action_values (dict[value]): Constant value per action (default: false for binary boolean
            actions, 0 for discrete integer actions, 0.0 for continuous actions).

        config (specification): Additional configuration options:
            • name (string) – Agent name, used e.g. for TensorFlow scopes (default: "agent").
            • device (string) – Device name (default: TensorFlow default).
            • seed (int) – Random seed to set for Python, NumPy (both set globally!) and
              TensorFlow, environment seed may have to be set separately for fully deterministic
              execution (default: none).
            • buffer_observe (false | "episode" | int > 0) – Number of timesteps within an episode
              to buffer before calling the internal observe function, to reduce calls to
              TensorFlow for improved performance (default: configuration-specific maximum number
              which can be buffered without affecting performance).
            • always_apply_exploration (bool) – Whether to always apply exploration, also for
              independent `act()` calls (final value in case of schedule) (default: false).
            • always_apply_variable_noise (bool) – Whether to always apply variable noise, also
              for independent `act()` calls (final value in case of schedule) (default: false).
            • enable_int_action_masking (bool) – Whether int action options can be masked via an
              optional "[ACTION-NAME]_mask" state input (default: true).
            • create_tf_assertions (bool) – Whether to create internal TensorFlow assertion
              operations (default: true).

        recorder (path | specification): Traces recordings directory, or recorder configuration
            with the following attributes (see
            [record-and-pretrain script](https://github.com/tensorforce/tensorforce/blob/master/examples/record_and_pretrain.py)
            for example application) (default: no recorder):
            • directory (path) – recorder directory (required).
            • frequency (int > 0) – how frequently in episodes to record traces
              (default: every episode).
            • start (int >= 0) – how many episodes to skip before starting to record traces
              (default: 0).
            • max-traces (int > 0) – maximum number of traces to keep (default: all).
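For orientation, a minimal act-observe sketch using the reworked interface above (illustration only, not diff content); it assumes the OpenAI Gym CartPole environment is available and relies on the default constant action values:

```python
from tensorforce import Agent, Environment

# Placeholder environment; any environment providing states()/actions() specs works.
environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)
agent = Agent.create(agent='constant', environment=environment)

for _ in range(10):
    states = environment.reset()
    terminal = False
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

agent.close()
environment.close()
```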
          • + """ + + def __init__( + # Environment + self, states, actions, max_episode_timesteps=None, + # Agent + action_values=None, + # Config, recorder + config=None, recorder=None + ): + if not hasattr(self, 'spec'): + self.spec = OrderedDict( + agent='constant', + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + action_values=action_values, + config=config, recorder=recorder + ) + + super().__init__( + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=1, config=config, recorder=recorder + ) + + self.model = ConstantModel( + states=self.states_spec, actions=self.actions_spec, + parallel_interactions=self.parallel_interactions, + config=self.config, summarizer=None, tracking=None, + action_values=action_values + ) diff --git a/tensorforce/agents/constant_agent.py b/tensorforce/agents/constant_agent.py deleted file mode 100644 index c01391e2f..000000000 --- a/tensorforce/agents/constant_agent.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import Agent -from tensorforce.models.constant_model import ConstantModel - - -class ConstantAgent(Agent): - """ - Agent returning constant action values. - """ - - def __init__( - self, - states, - actions, - action_values, - batched_observe=True, - batching_capacity=1000, - scope='constant', - device=None, - saver=None, - summarizer=None, - distributed=None - ): - """ - Initializes the constant agent. - - Args: - action_values (value, or dict of values): Action values returned by the agent - (required). - scope (str): TensorFlow scope (default: name of agent). - device: TensorFlow device (default: none) - saver (spec): Saver specification, with the following attributes (default: none): - - directory: model directory. - - file: model filename (optional). - - seconds or steps: save frequency (default: 600 seconds). - - load: specifies whether model is loaded, if existent (default: true). - - basename: optional file basename (default: 'model.ckpt'). - summarizer (spec): Summarizer specification, with the following attributes (default: - none): - - directory: summaries directory. - - seconds or steps: summarize frequency (default: 120 seconds). - - labels: list of summary labels to record (default: []). - - meta_param_recorder_class: ???. - distributed (spec): Distributed specification, with the following attributes (default: - none): - - cluster_spec: TensorFlow ClusterSpec object (required). - - task_index: integer (required). - - parameter_server: specifies whether this instance is a parameter server (default: - false). - - protocol: communication protocol (default: none, i.e. 'grpc'). - - config: TensorFlow ConfigProto object (default: none). 
- - replica_model: internal. - """ - - self.scope = scope - self.device = device - self.saver = saver - self.summarizer = summarizer - self.distributed = distributed - self.batching_capacity = batching_capacity - self.action_values = action_values - - super(ConstantAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity - ) - - def initialize_model(self): - return ConstantModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - action_values=self.action_values - ) diff --git a/tensorforce/agents/ddpg_agent.py b/tensorforce/agents/ddpg_agent.py deleted file mode 100755 index 25159374f..000000000 --- a/tensorforce/agents/ddpg_agent.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import LearningAgent -from tensorforce.models import DPGTargetModel - - -class DDPGAgent(LearningAgent): - """ - Deep Deterministic Policy Gradient agent - ([Lillicrap et al., 2015](https://arxiv.org/abs/1509.02971)). - """ - - def __init__( - self, - states, - actions, - network, - batched_observe=True, - batching_capacity=1000, - scope='ddpg', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - update_mode=None, - memory=None, - optimizer=None, - discount=0.99, - distributions=None, - entropy_regularization=None, - critic_network=None, - critic_optimizer=None, - target_sync_frequency=10000, - target_update_weight=1.0 - ): - """ - Initializes the DDPG agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes: - - unit: 'timesteps' if given (default: 'timesteps'). - - batch_size: integer (default: 10). - - frequency: integer (default: batch_size). - memory (spec): Memory specification, see core.memories module for more information - (default: {type='replay', include_next_states=true, capacity=1000*batch_size}). - optimizer (spec): Optimizer specification, see core.optimizers module for more - information (default: {type='adam', learning_rate=1e-3}). - critic_network (spec): Critic network specification, usually a list of layer specifications, - see core.networks module for more information (default: network). - critic_optimizer (spec): Critic optimizer specification, see core.optimizers module for - more information (default: {type='adam', learning_rate=1e-3}). - target_sync_frequency (int): Target network sync frequency (default: 10000). - target_update_weight (float): Target network update weight (default: 1.0). 
- """ - - # Update mode - if update_mode is None: - update_mode = dict( - unit='timesteps', - batch_size=10 - ) - elif 'unit' in update_mode: - assert update_mode['unit'] == 'timesteps' - else: - update_mode['unit'] = 'timesteps' - - # Memory - if memory is None: - # Assumed episode length of 1000 timesteps. - memory = dict( - type='replay', - include_next_states=True, - capacity=(1000 * update_mode['batch_size']) - ) - else: - assert memory['include_next_states'] - - # Optimizer - if optimizer is None: - optimizer = dict( - type='adam', - learning_rate=1e-3 - ) - - if critic_network is None: - critic_network = network - - if critic_optimizer is None: - critic_optimizer = dict( - type='adam', - learning_rate=1e-3 - ) - - self.critic_network = critic_network - self.critic_optimizer = critic_optimizer - self.target_sync_frequency = target_sync_frequency - self.target_update_weight = target_update_weight - - super(DDPGAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization - ) - - def initialize_model(self): - return DPGTargetModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - variable_noise=self.variable_noise, - states_preprocessing=self.states_preprocessing, - actions_exploration=self.actions_exploration, - reward_preprocessing=self.reward_preprocessing, - update_mode=self.update_mode, - memory=self.memory, - optimizer=self.optimizer, - discount=self.discount, - network=self.network, - distributions=self.distributions, - entropy_regularization=self.entropy_regularization, - critic_network=self.critic_network, - critic_optimizer=self.critic_optimizer, - target_sync_frequency=self.target_sync_frequency, - target_update_weight=self.target_update_weight - ) diff --git a/tensorforce/agents/double_dqn.py b/tensorforce/agents/double_dqn.py new file mode 100644 index 000000000..fd846460a --- /dev/null +++ b/tensorforce/agents/double_dqn.py @@ -0,0 +1,232 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class DoubleDQN(TensorforceAgent): + """ + [Double DQN](https://arxiv.org/abs/1509.06461) agent (specification key: `double_dqn` or + `ddqn`). 
+ + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
            • type ("bool" | "int" | "float") – state data type (default: "float").
            • shape (int | iter[int]) – state shape (required).
            • num_values (int > 0) – number of discrete state values (required for type "int").
            • min_value/max_value (float) – minimum/maximum state value (optional for type "float").

        actions (specification): Actions specification (required, better implicitly specified via
            `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of
            action descriptions (usually taken from `Environment.actions()`) with the following
            attributes:
            • type ("bool" | "int" | "float") – action data type (required).
            • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
            • num_values (int > 0) – number of discrete action values (required for type "int").
            • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
            + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + memory (int > 0): Replay memory capacity, has to fit at least maximum batch_size + maximum + network/estimator horizon + 1 timesteps + (required). + batch_size (parameter, int > 0): Number of + timesteps per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: 0.25 * batch_size). + start_updating (parameter, int >= batch_size): + Number of timesteps before first update + (default: none). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-3). + huber_loss (parameter, float > 0.0): Huber loss + threshold + (default: no huber loss). + + horizon (parameter, int >= 1): n-step DQN, horizon + of discounted-sum reward estimation before target network estimate + (default: 1). + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + target_update_weight (parameter, 0.0 < float <= 1.0): + Target network update weight + (default: 1.0). + target_sync_frequency (parameter, int >= 1): + Interval between target network updates + (default: every update). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). + exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise).
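For orientation, a minimal sketch (illustration only, not diff content) of creating this agent via its specification key; the environment and hyperparameter values are placeholders, and the remaining generic Tensorforce arguments are listed right after this sketch:

```python
from tensorforce import Agent, Environment

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)
# 'double_dqn' (or 'ddqn') is the specification key defined above; memory and
# batch_size are the two required agent arguments.
agent = Agent.create(
    agent='double_dqn', environment=environment,
    memory=10000, batch_size=32, learning_rate=1e-3, discount=0.99
)
```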

            + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Required + self, states, actions, memory, batch_size, + # Environment + max_episode_timesteps=None, + # Network + network='auto', + # Optimization + update_frequency=0.25, start_updating=None, learning_rate=1e-3, huber_loss=None, + # Reward estimation + horizon=1, discount=0.99, reward_processing=None, return_processing=None, + predict_terminal_values=False, + # Target network + target_update_weight=1.0, target_sync_frequency=1, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='DoubleDQN', argument='estimate_terminal', + replacement='predict_terminal_values' + ) + + self.spec = OrderedDict( + agent='dqn', + states=states, actions=actions, memory=memory, batch_size=batch_size, + max_episode_timesteps=max_episode_timesteps, + network=network, + update_frequency=update_frequency, start_updating=start_updating, + learning_rate=learning_rate, huber_loss=huber_loss, + horizon=horizon, discount=discount, return_processing=return_processing, + predict_terminal_values=predict_terminal_values, + target_update_weight=target_update_weight, target_sync_frequency=target_sync_frequency, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict( + type='parametrized_value_policy', network=network, state_value_mode='implicit' + ) + + memory = dict(type='replay', capacity=memory) + + update = dict( + unit='timesteps', batch_size=batch_size, frequency=update_frequency, + start=start_updating + ) + + optimizer = dict(type='adam', learning_rate=learning_rate) + objective = dict(type='action_value', huber_loss=huber_loss) + + reward_estimation = dict( + horizon=horizon, discount=discount, predict_horizon_values='late', + estimate_advantage=False, predict_action_values=True, + reward_processing=reward_processing, return_processing=return_processing, + predict_terminal_values=predict_terminal_values + ) + + baseline = policy + baseline_optimizer = dict( + type='synchronization', update_weight=target_update_weight, + sync_frequency=target_sync_frequency + ) + baseline_objective = None + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=baseline_optimizer, + baseline_objective=baseline_objective, + l2_regularization=l2_regularization, 
entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) diff --git a/tensorforce/agents/dpg.py b/tensorforce/agents/dpg.py new file mode 100644 index 000000000..b9de278db --- /dev/null +++ b/tensorforce/agents/dpg.py @@ -0,0 +1,234 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class DeterministicPolicyGradient(TensorforceAgent): + """ + [Deterministic Policy Gradient](https://arxiv.org/abs/1509.02971) agent (specification key: + `dpg` or `ddpg`). Action space is required to consist of only a single float action. + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
            • type ("bool" | "int" | "float") – state data type (default: "float").
            • shape (int | iter[int]) – state shape (required).
            • num_values (int > 0) – number of discrete state values (required for type "int").
            • min_value/max_value (float) – minimum/maximum state value (optional for type "float").

        actions (specification): Actions specification (required, better implicitly specified via
            `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of
            action descriptions (usually taken from `Environment.actions()`) with the following
            attributes:
            • type ("bool" | "int" | "float") – action data type (required).
            • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
            • num_values (int > 0) – number of discrete action values (required for type "int").
            • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
            + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + memory (int > 0): Replay memory capacity, has to fit at least maximum batch_size + maximum + network/estimator horizon + 1 timesteps + (required). + batch_size (parameter, int > 0): Number of + timesteps per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous + actions by default. + (default: true). + + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: batch_size). + start_updating (parameter, int >= batch_size): + Number of timesteps before first update + (default: none). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-3). + + horizon (parameter, int >= 1): Horizon of + discounted-sum reward estimation before critic estimate + (default: 1). + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + critic (specification): Critic network configuration, see the + [networks documentation](../modules/networks.html) + (default: none). + critic_optimizer (float > 0.0 | specification): Critic optimizer configuration, see the + [optimizers documentation](../modules/optimizers.html), a float instead specifies a + custom weight for the critic loss + (default: 1.0). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). + exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: 0.1 standard deviation). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise).
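For orientation, a minimal sketch (illustration only, not diff content) of creating this agent via its specification key; the Pendulum environment and hyperparameters are placeholder choices for a task with a single float action, and the remaining generic Tensorforce arguments are listed right after this sketch:

```python
from tensorforce import Agent, Environment

# Pendulum has a single continuous action, matching this agent's requirement.
environment = Environment.create(
    environment='gym', level='Pendulum-v0', max_episode_timesteps=200
)
agent = Agent.create(
    agent='dpg', environment=environment,
    memory=10000, batch_size=64, critic='auto', critic_optimizer=1.0
)
```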

            + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Required + self, states, actions, memory, batch_size, + # Environment + max_episode_timesteps=None, + # Network + network='auto', use_beta_distribution=True, + # Optimization + update_frequency=1.0, start_updating=None, learning_rate=1e-3, + # Reward estimation + horizon=1, discount=0.99, reward_processing=None, return_processing=None, + predict_terminal_values=False, + # Critic + critic='auto', critic_optimizer=1.0, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.1, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='DPG', argument='estimate_terminal', replacement='predict_terminal_values' + ) + if 'critic_network' in kwargs: + raise TensorforceError.deprecated( + name='DPG', argument='critic_network', replacement='critic' + ) + + self.spec = OrderedDict( + agent='dpg', + states=states, actions=actions, memory=memory, batch_size=batch_size, + max_episode_timesteps=max_episode_timesteps, + network=network, use_beta_distribution=use_beta_distribution, + update_frequency=update_frequency, start_updating=start_updating, + learning_rate=learning_rate, + horizon=horizon, discount=discount, return_processing=return_processing, + predict_terminal_values=predict_terminal_values, + critic=critic, critic_optimizer=critic_optimizer, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict( + type='parametrized_distributions', network=network, temperature=0.0, + use_beta_distribution=use_beta_distribution + ) + + memory = dict(type='replay', capacity=memory) + + update = dict( + unit='timesteps', batch_size=batch_size, frequency=update_frequency, + start=start_updating + ) + + optimizer = dict(type='adam', learning_rate=learning_rate) + objective = 'deterministic_policy_gradient' + + reward_estimation = dict( + horizon=horizon, discount=discount, predict_horizon_values='late', + estimate_advantage=False, predict_action_values=True, + reward_processing=reward_processing, return_processing=return_processing, + predict_terminal_values=predict_terminal_values + ) + + baseline = dict(type='parametrized_action_value', network=critic) + baseline_optimizer = critic_optimizer + baseline_objective = dict(type='action_value') + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=baseline_optimizer, + 
baseline_objective=baseline_objective, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) diff --git a/tensorforce/agents/dqfd_agent.py b/tensorforce/agents/dqfd_agent.py deleted file mode 100644 index 6ebbf4187..000000000 --- a/tensorforce/agents/dqfd_agent.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from six.moves import xrange - -from tensorforce.agents import LearningAgent -from tensorforce.models import QDemoModel - - -class DQFDAgent(LearningAgent): - """ - Deep Q-learning from demonstration agent - ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). - """ - - def __init__( - self, - states, - actions, - network, - batched_observe=True, - batching_capacity=1000, - scope='dqfd', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - update_mode=None, - memory=None, - optimizer=None, - discount=0.99, - distributions=None, - entropy_regularization=None, - target_sync_frequency=10000, - target_update_weight=1.0, - huber_loss=None, - # first_update=10000, - # repeat_update=1 - expert_margin=0.5, - supervised_weight=0.1, - demo_memory_capacity=10000, - demo_sampling_ratio=0.2 - ): - """ - Initializes the DQFD agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes: - - unit: 'timesteps' if given (default: 'timesteps'). - - batch_size: integer (default: 32). - - frequency: integer (default: 4). - memory (spec): Memory specification, see core.memories module for more information - (default: {type='replay', include_next_states=true, capacity=1000*batch_size}). - optimizer (spec): Optimizer specification, see core.optimizers module for more - information (default: {type='adam', learning_rate=1e-3}). - target_sync_frequency (int): Target network sync frequency (default: 10000). - target_update_weight (float): Target network update weight (default: 1.0). - huber_loss (float): Huber loss clipping (default: none). - expert_margin (float): Enforced supervised margin between expert action Q-value and - other Q-values (default: 0.5). - supervised_weight (float): Weight of supervised loss term (default: 0.1). - demo_memory_capacity (int): Capacity of expert demonstration memory (default: 10000). - demo_sampling_ratio (float): Runtime sampling ratio of expert data (default: 0.2). 
- """ - - # Update mode - if update_mode is None: - update_mode = dict( - unit='timesteps', - batch_size=32, - frequency=4 - ) - elif 'unit' in update_mode: - assert update_mode['unit'] == 'timesteps' - else: - update_mode['unit'] = 'timesteps' - - # Memory - if memory is None: - # Default capacity of 1000 batches - memory = dict( - type='replay', - include_next_states=True, - capacity=(1000 * update_mode['batch_size']) - ) - else: - assert memory['include_next_states'] - - # Optimizer - if optimizer is None: - optimizer = dict( - type='adam', - learning_rate=1e-3 - ) - - self.target_sync_frequency = target_sync_frequency - self.target_update_weight = target_update_weight - self.double_q_model = True - self.huber_loss = huber_loss - self.expert_margin = expert_margin - self.supervised_weight = supervised_weight - - self.demo_memory_capacity = demo_memory_capacity - # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples - # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p) - self.demo_batch_size = int(demo_sampling_ratio * update_mode['batch_size'] / (1.0 - demo_sampling_ratio)) - assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \ - 'demo_batch_size is positive. (Calculated {} based on current' \ - ' parameters)'.format(self.demo_batch_size) - - # This is the demonstration memory that we will fill with observations before starting - # the main training loop - super(DQFDAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization - ) - - def initialize_model(self): - return QDemoModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - variable_noise=self.variable_noise, - states_preprocessing=self.states_preprocessing, - actions_exploration=self.actions_exploration, - reward_preprocessing=self.reward_preprocessing, - update_mode=self.update_mode, - memory=self.memory, - optimizer=self.optimizer, - discount=self.discount, - network=self.network, - distributions=self.distributions, - entropy_regularization=self.entropy_regularization, - target_sync_frequency=self.target_sync_frequency, - target_update_weight=self.target_update_weight, - # DQFD always uses double dqn, which is a required key for a q-model. - double_q_model=True, - huber_loss=self.huber_loss, - expert_margin=self.expert_margin, - supervised_weight=self.supervised_weight, - demo_memory_capacity=self.demo_memory_capacity, - demo_batch_size=self.demo_batch_size - ) - - # This is handled by the model now - # def observe(self, reward, terminal): - # """ - # Adds observations, updates via sampling from memories according to update rate. - # DQFD samples from the online replay memory and the demo memory with - # the fractions controlled by a hyper parameter p called 'expert sampling ratio. 
- # - # Args: - # reward: - # terminal: - # """ - # super(DQFDAgent, self).observe(reward=reward, terminal=terminal) - # if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0: - # for _ in xrange(self.repeat_update): - # self.model.demonstration_update() - - def import_demonstrations(self, demonstrations): - """ - Imports demonstrations, i.e. expert observations. Note that for large numbers of observations, - set_demonstrations is more appropriate, which directly sets memory contents to an array an expects - a different layout. - - Args: - demonstrations: List of observation dicts - """ - if isinstance(demonstrations, dict): - if self.unique_state: - demonstrations['states'] = dict(state=demonstrations['states']) - if self.unique_action: - demonstrations['actions'] = dict(action=demonstrations['actions']) - - self.model.import_demo_experience(**demonstrations) - - else: - if self.unique_state: - states = dict(state=list()) - else: - states = {name: list() for name in demonstrations[0]['states']} - internals = {name: list() for name in demonstrations[0]['internals']} - if self.unique_action: - actions = dict(action=list()) - else: - actions = {name: list() for name in demonstrations[0]['actions']} - terminal = list() - reward = list() - - for demonstration in demonstrations: - if self.unique_state: - states['state'].append(demonstration['states']) - else: - for name, state in states.items(): - state.append(demonstration['states'][name]) - for name, internal in internals.items(): - internal.append(demonstration['internals'][name]) - if self.unique_action: - actions['action'].append(demonstration['actions']) - else: - for name, action in actions.items(): - action.append(demonstration['actions'][name]) - terminal.append(demonstration['terminal']) - reward.append(demonstration['reward']) - - self.model.import_demo_experience( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - def pretrain(self, steps): - """ - Computes pre-train updates. - - Args: - steps: Number of updates to execute. - """ - for _ in xrange(steps): - self.model.demo_update() diff --git a/tensorforce/agents/dqn.py b/tensorforce/agents/dqn.py new file mode 100644 index 000000000..44efeb38b --- /dev/null +++ b/tensorforce/agents/dqn.py @@ -0,0 +1,230 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class DeepQNetwork(TensorforceAgent): + """ + [Deep Q-Network](https://www.nature.com/articles/nature14236) agent (specification key: `dqn`). 
+
+    Args:
+        states (specification): States specification
+            (required, better implicitly specified via `environment` argument for
+            `Agent.create(...)`), arbitrarily nested dictionary of state descriptions (usually
+            taken from `Environment.states()`) with the following attributes:
+            - type ("bool" | "int" | "float") – state data type (default: "float").
+            - shape (int | iter[int]) – state shape (required).
+            - num_values (int > 0) – number of discrete state values (required for type "int").
+            - min_value/max_value (float) – minimum/maximum state value
+              (optional for type "float").
+        actions (specification): Actions specification
+            (required, better implicitly specified via `environment` argument for
+            `Agent.create(...)`), arbitrarily nested dictionary of action descriptions (usually
+            taken from `Environment.actions()`) with the following attributes:
+            - type ("bool" | "int" | "float") – action data type (required).
+            - shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
+            - num_values (int > 0) – number of discrete action values (required for type "int").
+            - min_value/max_value (float) – minimum/maximum action value
+              (optional for type "float").
            + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + memory (int > 0): Replay memory capacity, has to fit at least maximum batch_size + maximum + network/estimator horizon + 1 timesteps + (required). + batch_size (parameter, int > 0): Number of + timesteps per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: 0.25 * batch_size). + start_updating (parameter, int >= batch_size): + Number of timesteps before first update + (default: none). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-3). + huber_loss (parameter, float > 0.0): Huber loss + threshold + (default: no huber loss). + + horizon (parameter, int >= 1): n-step DQN, horizon + of discounted-sum reward estimation before target network estimate + (default: 1). + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + target_update_weight (parameter, 0.0 < float <= 1.0): + Target network update weight + (default: 1.0). + target_sync_frequency (parameter, int >= 1): + Interval between target network updates + (default: every update). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). + exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise).

            + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Required + self, states, actions, memory, batch_size, + # Environment + max_episode_timesteps=None, + # Network + network='auto', + # Optimization + update_frequency=0.25, start_updating=None, learning_rate=1e-3, huber_loss=None, + # Reward estimation + horizon=1, discount=0.99, reward_processing=None, return_processing=None, + predict_terminal_values=False, + # Target network + target_update_weight=1.0, target_sync_frequency=1, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='DQN', argument='estimate_terminal', replacement='predict_terminal_values' + ) + + self.spec = OrderedDict( + agent='dqn', + states=states, actions=actions, memory=memory, batch_size=batch_size, + max_episode_timesteps=max_episode_timesteps, + network=network, + update_frequency=update_frequency, start_updating=start_updating, + learning_rate=learning_rate, huber_loss=huber_loss, + horizon=horizon, discount=discount, return_processing=return_processing, + predict_terminal_values=predict_terminal_values, + target_update_weight=target_update_weight, target_sync_frequency=target_sync_frequency, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict( + type='parametrized_value_policy', network=network, state_value_mode='implicit' + ) + + memory = dict(type='replay', capacity=memory) + + update = dict( + unit='timesteps', batch_size=batch_size, frequency=update_frequency, + start=start_updating + ) + + optimizer = dict(type='adam', learning_rate=learning_rate) + objective = dict(type='action_value', huber_loss=huber_loss) + + reward_estimation = dict( + horizon=horizon, discount=discount, predict_horizon_values='late', + estimate_advantage=False, predict_action_values=False, + reward_processing=reward_processing, return_processing=return_processing, + predict_terminal_values=predict_terminal_values + ) + + baseline = policy + baseline_optimizer = dict( + type='synchronization', update_weight=target_update_weight, + sync_frequency=target_sync_frequency + ) + baseline_objective = None + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=baseline_optimizer, + baseline_objective=baseline_objective, + l2_regularization=l2_regularization, 
entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) diff --git a/tensorforce/agents/dqn_agent.py b/tensorforce/agents/dqn_agent.py deleted file mode 100755 index c0e924955..000000000 --- a/tensorforce/agents/dqn_agent.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import LearningAgent -from tensorforce.models import QModel - - -class DQNAgent(LearningAgent): - """ - Deep Q-Network agent ([Mnih et al., 2015](https://www.nature.com/articles/nature14236)). - """ - - def __init__( - self, - states, - actions, - network, - batched_observe=True, - batching_capacity=1000, - scope='dqn', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - update_mode=None, - memory=None, - optimizer=None, - discount=0.99, - distributions=None, - entropy_regularization=None, - target_sync_frequency=10000, - target_update_weight=1.0, - double_q_model=False, - huber_loss=None - # first_update=10000, - # repeat_update=1 - ): - """ - Initializes the DQN agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes: - - unit: 'timesteps' if given (default: 'timesteps'). - - batch_size: integer (default: 32). - - frequency: integer (default: 4). - memory (spec): Memory specification, see core.memories module for more information - (default: {type='replay', include_next_states=true, capacity=1000*batch_size}). - optimizer (spec): Optimizer specification, see core.optimizers module for more - information (default: {type='adam', learning_rate=1e-3}). - target_sync_frequency (int): Target network sync frequency (default: 10000). - target_update_weight (float): Target network update weight (default: 1.0). - double_q_model (bool): Specifies whether double DQN mode is used (default: false). - huber_loss (float): Huber loss clipping (default: none). 
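With the old `DQNAgent` (and the `DQNNstepAgent` further below) removed, a DQN agent is now created through the `dqn` specification key defined above, and n-step behavior is expressed via its `horizon` argument. A minimal usage sketch, assuming a hypothetical Gym CartPole environment; the exact `Environment.create(...)` arguments are illustrative only:

```python
from tensorforce import Agent, Environment

# Hypothetical environment specification, used only to illustrate Agent.create(...).
environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)

# Replaces the removed DQNAgent(states=..., actions=..., network=...) construction;
# states, actions and max_episode_timesteps are inferred from the environment.
agent = Agent.create(
    agent='dqn', environment=environment,
    memory=10000, batch_size=32, exploration=0.1,
    horizon=4  # n-step DQN, roughly the role of the removed DQNNstepAgent
)
```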
- """ - - # Update mode - if update_mode is None: - update_mode = dict( - unit='timesteps', - batch_size=32, - frequency=4 - ) - elif 'unit' in update_mode: - assert update_mode['unit'] == 'timesteps' - else: - update_mode['unit'] = 'timesteps' - - # Memory - if memory is None: - # Default capacity of 1000 batches - memory = dict( - type='replay', - include_next_states=True, - capacity=(1000 * update_mode['batch_size']) - ) - else: - assert memory['include_next_states'] - - # Optimizer - if optimizer is None: - optimizer = dict( - type='adam', - learning_rate=1e-3 - ) - - self.target_sync_frequency = target_sync_frequency - self.target_update_weight = target_update_weight - self.double_q_model = double_q_model - self.huber_loss = huber_loss - - super(DQNAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization - ) - - def initialize_model(self): - return QModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - variable_noise=self.variable_noise, - states_preprocessing=self.states_preprocessing, - actions_exploration=self.actions_exploration, - reward_preprocessing=self.reward_preprocessing, - update_mode=self.update_mode, - memory=self.memory, - optimizer=self.optimizer, - discount=self.discount, - network=self.network, - distributions=self.distributions, - entropy_regularization=self.entropy_regularization, - target_sync_frequency=self.target_sync_frequency, - target_update_weight=self.target_update_weight, - double_q_model=self.double_q_model, - huber_loss=self.huber_loss - ) diff --git a/tensorforce/agents/dqn_nstep_agent.py b/tensorforce/agents/dqn_nstep_agent.py deleted file mode 100644 index b0dab446c..000000000 --- a/tensorforce/agents/dqn_nstep_agent.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import LearningAgent -from tensorforce.models import QNstepModel - - -class DQNNstepAgent(LearningAgent): - """ - DQN n-step agent. 
- """ - - def __init__( - self, - states, - actions, - network, - batched_observe=True, - batching_capacity=1000, - scope='dqn-nstep', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - update_mode=None, - memory=None, - optimizer=None, - discount=0.99, - distributions=None, - entropy_regularization=None, - target_sync_frequency=10000, - target_update_weight=1.0, - double_q_model=False, - huber_loss=None - ): - """ - Initializes the DQN n-step agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes: - - unit: 'episodes' if given (default: 'episodes'). - - batch_size: integer (default: 10). - - frequency: integer (default: batch_size). - memory (spec): Memory specification, see core.memories module for more information - (default: {type='latest', include_next_states=true, capacity=1000*batch_size}). - optimizer (spec): Optimizer specification, see core.optimizers module for more - information (default: {type='adam', learning_rate=1e-3}). - target_sync_frequency (int): Target network sync frequency (default: 10000). - target_update_weight (float): Target network update weight (default: 1.0). - double_q_model (bool): Specifies whether double DQN mode is used (default: false). - huber_loss (float): Huber loss clipping (default: none). - """ - - # Update mode - if update_mode is None: - update_mode = dict( - unit='episodes', - batch_size=10 - ) - elif 'unit' in update_mode: - assert update_mode['unit'] == 'episodes' - else: - update_mode['unit'] = 'episodes' - - # Memory - if memory is None: - # Assumed episode length of 1000 timesteps. - memory = dict( - type='latest', - include_next_states=True, - capacity=(1000 * update_mode['batch_size']) - ) - else: - assert memory['include_next_states'] - - # Optimizer - if optimizer is None: - optimizer = dict( - type='adam', - learning_rate=1e-3 - ) - - self.target_sync_frequency = target_sync_frequency - self.target_update_weight = target_update_weight - self.double_q_model = double_q_model - self.huber_loss = huber_loss - - super(DQNNstepAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization - ) - - def initialize_model(self): - return QNstepModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - variable_noise=self.variable_noise, - states_preprocessing=self.states_preprocessing, - actions_exploration=self.actions_exploration, - reward_preprocessing=self.reward_preprocessing, - update_mode=self.update_mode, - memory=self.memory, - optimizer=self.optimizer, - discount=self.discount, - network=self.network, - distributions=self.distributions, - entropy_regularization=self.entropy_regularization, - target_sync_frequency=self.target_sync_frequency, - target_update_weight=self.target_update_weight, - 
double_q_model=self.double_q_model, - huber_loss=self.huber_loss - ) diff --git a/tensorforce/agents/dueling_dqn.py b/tensorforce/agents/dueling_dqn.py new file mode 100644 index 000000000..b18d55fb5 --- /dev/null +++ b/tensorforce/agents/dueling_dqn.py @@ -0,0 +1,233 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class DuelingDQN(TensorforceAgent): + """ + [Dueling DQN](https://arxiv.org/abs/1511.06581) agent (specification key: `dueling_dqn`). + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
+            - type ("bool" | "int" | "float") – state data type (default: "float").
+            - shape (int | iter[int]) – state shape (required).
+            - num_values (int > 0) – number of discrete state values (required for type "int").
+            - min_value/max_value (float) – minimum/maximum state value
+              (optional for type "float").
+        actions (specification): Actions specification
+            (required, better implicitly specified via `environment` argument for
+            `Agent.create(...)`), arbitrarily nested dictionary of action descriptions (usually
+            taken from `Environment.actions()`) with the following attributes:
+            - type ("bool" | "int" | "float") – action data type (required).
+            - shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
+            - num_values (int > 0) – number of discrete action values (required for type "int").
+            - min_value/max_value (float) – minimum/maximum action value
+              (optional for type "float").
            + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + memory (int > 0): Replay memory capacity, has to fit at least maximum batch_size + maximum + network/estimator horizon + 1 timesteps + (required). + batch_size (parameter, int > 0): Number of + timesteps per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: 0.25 * batch_size). + start_updating (parameter, int >= batch_size): + Number of timesteps before first update + (default: none). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-3). + huber_loss (parameter, float > 0.0): Huber loss + threshold + (default: no huber loss). + + horizon (parameter, int >= 1): n-step DQN, horizon + of discounted-sum reward estimation before target network estimate + (default: 1). + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + target_update_weight (parameter, 0.0 < float <= 1.0): + Target network update weight + (default: 1.0). + target_sync_frequency (parameter, int >= 1): + Interval between target network updates + (default: every update). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). + exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise).
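Compared to `dqn` above, the dueling variant differs mainly in how the constructor further below wires up the internal policy and reward estimation: it omits `state_value_mode='implicit'`, so state value and advantage are parametrized separately, and it bootstraps from predicted action values; it also rejects non-`int` actions at the end of its constructor. A paraphrased side-by-side sketch of the derived settings (not part of the diff itself):

```python
# Paraphrased from the two constructors in this changeset (network='auto' assumed).
dqn_policy = dict(type='parametrized_value_policy', network='auto', state_value_mode='implicit')
dueling_policy = dict(type='parametrized_value_policy', network='auto')  # separate state value + advantage

dqn_reward_estimation = dict(predict_horizon_values='late', predict_action_values=False)
dueling_reward_estimation = dict(predict_horizon_values='late', predict_action_values=True)
```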

            + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Required + self, states, actions, memory, batch_size, + # Environment + max_episode_timesteps=None, + # Network + network='auto', + # Optimization + update_frequency=0.25, start_updating=None, learning_rate=1e-3, huber_loss=None, + # Reward estimation + horizon=1, discount=0.99, reward_processing=None, return_processing=None, + predict_terminal_values=False, + # Target network + target_update_weight=1.0, target_sync_frequency=1, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='DuelingDQN', argument='estimate_terminal', + replacement='predict_terminal_values' + ) + + self.spec = OrderedDict( + agent='dueling_dqn', + states=states, actions=actions, memory=memory, batch_size=batch_size, + max_episode_timesteps=max_episode_timesteps, + network=network, + update_frequency=update_frequency, start_updating=start_updating, + learning_rate=learning_rate, huber_loss=huber_loss, + horizon=horizon, discount=discount, return_processing=return_processing, + predict_terminal_values=predict_terminal_values, + target_update_weight=target_update_weight, target_sync_frequency=target_sync_frequency, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict(type='parametrized_value_policy', network=network) + + memory = dict(type='replay', capacity=memory) + + update = dict( + unit='timesteps', batch_size=batch_size, frequency=update_frequency, + start=start_updating + ) + + optimizer = dict(type='adam', learning_rate=learning_rate) + objective = dict(type='action_value', huber_loss=huber_loss) + + reward_estimation = dict( + horizon=horizon, discount=discount, predict_horizon_values='late', + estimate_advantage=False, predict_action_values=True, + reward_processing=reward_processing, return_processing=return_processing, + predict_terminal_values=predict_terminal_values + ) + + baseline = policy + baseline_optimizer = dict( + type='synchronization', update_weight=target_update_weight, + sync_frequency=target_sync_frequency + ) + baseline_objective = None + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=baseline_optimizer, + baseline_objective=baseline_objective, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + 
state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) + + if any(spec.type != 'int' for spec in self.actions_spec.values()): + raise TensorforceError.value( + name='DuelingDQN', argument='actions', value=actions, hint='contains non-int action' + ) diff --git a/tensorforce/agents/learning_agent.py b/tensorforce/agents/learning_agent.py deleted file mode 100644 index 4fc67a488..000000000 --- a/tensorforce/agents/learning_agent.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import inspect - -from tensorforce import TensorForceError -from tensorforce.agents.agent import Agent -from tensorforce.meta_parameter_recorder import MetaParameterRecorder - - -class LearningAgent(Agent): - """ - Base class for learning agents, using as model a subclass of MemoryModel and DistributionModel. - """ - - def __init__( - self, - states, - actions, - network, - update_mode, - memory, - optimizer, - batched_observe=True, - batching_capacity=1000, - scope='learning-agent', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - discount=0.99, - distributions=None, - entropy_regularization=None - ): - """ - Initializes the learning agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes - (required): - - unit: one of 'timesteps', 'episodes', 'sequences' (required). - - batch_size: integer (required). - - frequency: integer (default: batch_size). - - length: integer (optional if unit == 'sequences', default: 8). - memory (spec): Memory specification, see core.memories module for more information - (required). - optimizer (spec): Optimizer specification, see core.optimizers module for more - information (required). - network (spec): Network specification, usually a list of layer specifications, see - core.networks module for more information (required). - scope (str): TensorFlow scope (default: name of agent). - device: TensorFlow device (default: none) - saver (spec): Saver specification, with the following attributes (default: none): - - directory: model directory. - - file: model filename (optional). - - seconds or steps: save frequency (default: 600 seconds). - - load: specifies whether model is loaded, if existent (default: true). - - basename: optional file basename (default: 'model.ckpt'). - summarizer (spec): Summarizer specification, with the following attributes (default: - none): - - directory: summaries directory. - - seconds or steps: summarize frequency (default: 120 seconds). - - labels: list of summary labels to record (default: []). 
- - meta_param_recorder_class: ???. - distributed (spec): Distributed specification, with the following attributes (default: - none): - - cluster_spec: TensorFlow ClusterSpec object (required). - - task_index: integer (required). - - parameter_server: specifies whether this instance is a parameter server (default: - false). - - protocol: communication protocol (default: none, i.e. 'grpc'). - - config: TensorFlow ConfigProto object (default: none). - - replica_model: internal. - variable_noise (float): Standard deviation of variable noise (default: none). - states_preprocessing (spec, or dict of specs): States preprocessing specification, see - core.preprocessors module for more information (default: none) - actions_exploration (spec, or dict of specs): Actions exploration specification, see - core.explorations module for more information (default: none). - reward_preprocessing (spec): Reward preprocessing specification, see core.preprocessors - module for more information (default: none). - discount (float): Discount factor for future rewards (default: 0.99). - distributions (spec / dict of specs): Distributions specifications, see - core.distributions module for more information (default: none). - entropy_regularization (float): Entropy regularization weight (default: none). - """ - - self.scope = scope - self.device = device - self.saver = saver - self.summarizer = summarizer - self.distributed = distributed - self.variable_noise = variable_noise - self.states_preprocessing = states_preprocessing - self.actions_exploration = actions_exploration - self.reward_preprocessing = reward_preprocessing - self.update_mode = update_mode - self.memory = memory - self.optimizer = optimizer - self.discount = discount - self.network = network - self.distributions = distributions - self.entropy_regularization = entropy_regularization - - # TensorFlow summarizer & Configuration Meta Parameter Recorder options - if self.summarizer is None: - summary_labels = set() - else: - summary_labels = set(self.summarizer.get('labels', ())) - - self.meta_param_recorder = None - - # if 'configuration' in self.summary_labels or 'print_configuration' in self.summary_labels: - if any(k in summary_labels for k in ['configuration', 'print_configuration']): - self.meta_param_recorder = MetaParameterRecorder(inspect.currentframe()) - if 'meta_dict' in self.summarizer: - # Custom Meta Dictionary passed - self.meta_param_recorder.merge_custom(self.summarizer['meta_dict']) - if 'configuration' in summary_labels: - # Setup for TensorBoard population - self.summarizer['meta_param_recorder_class'] = self.meta_param_recorder - if 'print_configuration' in summary_labels: - # Print to STDOUT (TODO: optimize output) - self.meta_param_recorder.text_output(format_type=1) - - super(LearningAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity - ) - - def import_experience(self, experiences): - """ - Imports experiences. 
- - Args: - experiences: - """ - if isinstance(experiences, dict): - if self.unique_state: - experiences['states'] = dict(state=experiences['states']) - if self.unique_action: - experiences['actions'] = dict(action=experiences['actions']) - - self.model.import_experience(**experiences) - - else: - if self.unique_state: - states = dict(state=list()) - else: - states = {name: list() for name in experiences[0]['states']} - internals = [list() for _ in experiences[0]['internals']] - if self.unique_action: - actions = dict(action=list()) - else: - actions = {name: list() for name in experiences[0]['actions']} - terminal = list() - reward = list() - - for experience in experiences: - if self.unique_state: - states['state'].append(experience['states']) - else: - for name, state in states.items(): - state.append(experience['states'][name]) - for n, internal in enumerate(internals): - internal.append(experience['internals'][n]) - if self.unique_action: - actions['action'].append(experience['actions']) - else: - for name, action in actions.items(): - action.append(experience['actions'][name]) - terminal.append(experience['terminal']) - reward.append(experience['reward']) - - self.model.import_experience( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) diff --git a/tensorforce/agents/naf_agent.py b/tensorforce/agents/naf_agent.py deleted file mode 100755 index 973168c93..000000000 --- a/tensorforce/agents/naf_agent.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import LearningAgent -from tensorforce.models import QNAFModel - - -class NAFAgent(LearningAgent): - """ - Normalized Advantage Function agent ([Gu et al., 2016](https://arxiv.org/abs/1603.00748)). - """ - - def __init__( - self, - states, - actions, - network, - batched_observe=True, - batching_capacity=1000, - scope='naf', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - update_mode=None, - memory=None, - optimizer=None, - discount=0.99, - distributions=None, - entropy_regularization=None, - target_sync_frequency=10000, - target_update_weight=1.0, - double_q_model=False, - huber_loss=None - # first_update=10000, - # repeat_update=1 - ): - """ - Initializes the NAF agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes: - - unit: 'timesteps' if given (default: 'timesteps'). - - batch_size: integer (default: 32). - - frequency: integer (default: 4). - memory (spec): Memory specification, see core.memories module for more information - (default: {type='replay', include_next_states=true, capacity=1000*batch_size}). 
- optimizer (spec): Optimizer specification, see core.optimizers module for more - information (default: {type='adam', learning_rate=1e-3}). - target_sync_frequency (int): Target network sync frequency (default: 10000). - target_update_weight (float): Target network update weight (default: 1.0). - double_q_model (bool): Specifies whether double DQN mode is used (default: false). - huber_loss (float): Huber loss clipping (default: none). - """ - - # Update mode - if update_mode is None: - update_mode = dict( - unit='timesteps', - batch_size=32, - frequency=4 - ) - elif 'unit' in update_mode: - assert update_mode['unit'] == 'timesteps' - else: - update_mode['unit'] = 'timesteps' - - # Memory - if memory is None: - # Default capacity of 1000 batches - memory = dict( - type='replay', - include_next_states=True, - capacity=(1000 * update_mode['batch_size']) - ) - else: - assert memory['include_next_states'] - - # Optimizer - if optimizer is None: - optimizer = dict( - type='adam', - learning_rate=1e-3 - ) - - self.target_sync_frequency = target_sync_frequency - self.target_update_weight = target_update_weight - self.double_q_model = double_q_model - self.huber_loss = huber_loss - - super(NAFAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization - ) - - def initialize_model(self): - return QNAFModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - variable_noise=self.variable_noise, - states_preprocessing=self.states_preprocessing, - actions_exploration=self.actions_exploration, - reward_preprocessing=self.reward_preprocessing, - update_mode=self.update_mode, - memory=self.memory, - optimizer=self.optimizer, - discount=self.discount, - network=self.network, - distributions=self.distributions, - entropy_regularization=self.entropy_regularization, - target_sync_frequency=self.target_sync_frequency, - target_update_weight=self.target_update_weight, - double_q_model=self.double_q_model, - huber_loss=self.huber_loss - ) diff --git a/tensorforce/agents/ppo.py b/tensorforce/agents/ppo.py new file mode 100644 index 000000000..e2891cf3a --- /dev/null +++ b/tensorforce/agents/ppo.py @@ -0,0 +1,276 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class ProximalPolicyOptimization(TensorforceAgent): + """ + [Proximal Policy Optimization](https://arxiv.org/abs/1707.06347) agent (specification key: + `ppo`). + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
+            - type ("bool" | "int" | "float") – state data type (default: "float").
+            - shape (int | iter[int]) – state shape (required).
+            - num_values (int > 0) – number of discrete state values (required for type "int").
+            - min_value/max_value (float) – minimum/maximum state value
+              (optional for type "float").
+        actions (specification): Actions specification
+            (required, better implicitly specified via `environment` argument for
+            `Agent.create(...)`), arbitrarily nested dictionary of action descriptions (usually
+            taken from `Environment.actions()`) with the following attributes:
+            - type ("bool" | "int" | "float") – action data type (required).
+            - shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
+            - num_values (int > 0) – number of discrete action values (required for type "int").
+            - min_value/max_value (float) – minimum/maximum action value
+              (optional for type "float").
            + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + batch_size (parameter, int > 0): Number of episodes + per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous + actions by default. + (default: false). + + memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + 1 episodes + (default: minimum capacity, usually does not + need to be changed). + + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: batch_size). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-3). + multi_step (parameter, int >= 1): Number of + optimization steps, update_frequency * multi_step should be at least 1 if relative + subsampling_fraction + (default: 10). + subsampling_fraction (parameter, int > 0 | 0.0 < float <= 1.0): + Absolute/relative fraction of batch timesteps to subsample, + update_frequency * multi_step should be at least 1 if relative subsampling_fraction + (default: 0.33). + + likelihood_ratio_clipping (parameter, float > 0.0): + Likelihood-ratio clipping threshold + (default: 0.25). + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + advantage_processing (specification): Advantage processing as layer or list of layers, see + the [preprocessing documentation](../modules/preprocessing.html) + (default: no advantage processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + baseline (specification): Baseline network configuration, see the + [networks documentation](../modules/networks.html), + main policy will be used as baseline if none + (default: none). + baseline_optimizer (float > 0.0 | specification): Baseline optimizer configuration, see the + [optimizers documentation](../modules/optimizers.html), main optimizer will be used for + baseline if none, a float implies none and specifies a custom weight for the baseline + loss + (default: none). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). 
+ exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise).
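The constructor below maps these arguments onto a multi-step, subsampling Adam optimizer; the removed `PPOAgent` further down exposed the same idea through `step_optimizer`, `subsampling_fraction` and `optimization_steps`, and the deprecation check below confirms that `optimization_steps` became `multi_step`. A minimal usage sketch, reusing the hypothetical CartPole environment from the DQN example:

```python
# Old PPOAgent arguments (removed)        ->  new-style arguments (this file)
#   optimization_steps=50                 ->  multi_step=10 (default)
#   subsampling_fraction=0.1              ->  subsampling_fraction=0.33 (default)
#   step_optimizer={'type': 'adam', ...}  ->  learning_rate=1e-3 (Adam is implied)
agent = Agent.create(
    agent='ppo', environment=environment, batch_size=10,
    learning_rate=1e-3, multi_step=10, subsampling_fraction=0.33
)
```

Note also that, per the constructor below, passing a `baseline` network switches reward estimation to early horizon-value prediction with advantage estimates, and then a `baseline_optimizer` is required as well.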

            + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Required + self, states, actions, max_episode_timesteps, batch_size, + # Network + network='auto', use_beta_distribution=False, + # Memory + memory='minimum', + # Optimization + update_frequency=1.0, learning_rate=1e-3, multi_step=10, subsampling_fraction=0.33, + # Reward estimation + likelihood_ratio_clipping=0.25, discount=0.99, reward_processing=None, + return_processing=None, advantage_processing=None, predict_terminal_values=False, + # Baseline + baseline=None, baseline_optimizer=None, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'optimization_steps' in kwargs: + raise TensorforceError.deprecated( + name='PPO', argument='optimization_steps', replacement='multi_step' + ) + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='PPO', argument='estimate_terminal', replacement='predict_terminal_values' + ) + if 'critic_network' in kwargs: + raise TensorforceError.deprecated( + name='PPO', argument='critic_network', replacement='baseline' + ) + if 'baseline_network' in kwargs: + raise TensorforceError.deprecated( + name='PPO', argument='baseline_network', replacement='baseline' + ) + if 'critic_optimizer' in kwargs: + raise TensorforceError.deprecated( + name='PPO', argument='critic_optimizer', replacement='baseline_optimizer' + ) + + self.spec = OrderedDict( + agent='ppo', + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + batch_size=batch_size, + network=network, use_beta_distribution=use_beta_distribution, + memory=memory, + update_frequency=update_frequency, learning_rate=learning_rate, + multi_step=multi_step, subsampling_fraction=subsampling_fraction, + likelihood_ratio_clipping=likelihood_ratio_clipping, discount=discount, + return_processing=return_processing, advantage_processing=advantage_processing, + predict_terminal_values=predict_terminal_values, + baseline=baseline, baseline_optimizer=baseline_optimizer, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict( + type='parametrized_distributions', network=network, temperature=1.0, + use_beta_distribution=use_beta_distribution + ) + + if memory == 'minimum': + memory = dict(type='recent') + else: + memory = dict(type='recent', capacity=memory) + + update = dict(unit='episodes', batch_size=batch_size, frequency=update_frequency) + + optimizer = dict( + optimizer='adam', learning_rate=learning_rate, multi_step=multi_step, + subsampling_fraction=subsampling_fraction + ) + objective = dict( + type='policy_gradient', importance_sampling=True, + clipping_value=likelihood_ratio_clipping + ) + + if baseline is None: + assert 
not predict_terminal_values + reward_estimation = dict( + horizon='episode', discount=discount, predict_horizon_values=False, + estimate_advantage=False, reward_processing=reward_processing, + return_processing=return_processing + ) + assert baseline_optimizer is None + baseline_objective = None + + else: + reward_estimation = dict( + horizon='episode', discount=discount, predict_horizon_values='early', + estimate_advantage=True, predict_action_values=False, + reward_processing=reward_processing, return_processing=return_processing, + advantage_processing=advantage_processing, + predict_terminal_values=predict_terminal_values + ) + baseline = dict(type='parametrized_state_value', network=baseline) + assert baseline_optimizer is not None + baseline_objective = dict(type='state_value') + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=baseline_optimizer, + baseline_objective=baseline_objective, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) diff --git a/tensorforce/agents/ppo_agent.py b/tensorforce/agents/ppo_agent.py deleted file mode 100644 index 2912eb006..000000000 --- a/tensorforce/agents/ppo_agent.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import LearningAgent -from tensorforce.models import PGProbRatioModel - - -class PPOAgent(LearningAgent): - """ - Proximal Policy Optimization agent ([Schulman et al., 2017](https://arxiv.org/abs/1707.06347)). - """ - - def __init__( - self, - states, - actions, - network, - batched_observe=True, - batching_capacity=1000, - scope='ppo', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - update_mode=None, - memory=None, - discount=0.99, - distributions=None, - entropy_regularization=None, - baseline_mode=None, - baseline=None, - baseline_optimizer=None, - gae_lambda=None, - likelihood_ratio_clipping=0.2, - step_optimizer=None, - subsampling_fraction=0.1, - optimization_steps=50 - ): - """ - Initializes the PPO agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes: - - unit: 'episodes' if given (default: 'episodes'). 
- - batch_size: integer (default: 10). - - frequency: integer (default: batch_size). - memory (spec): Memory specification, see core.memories module for more information - (default: {type='latest', include_next_states=false, capacity=1000*batch_size}). - optimizer (spec): PPO agent implicitly defines a multi-step subsampling optimizer. - baseline_mode (str): One of 'states', 'network' (default: none). - baseline (spec): Baseline specification, see core.baselines module for more information - (default: none). - baseline_optimizer (spec): Baseline optimizer specification, see core.optimizers module - for more information (default: none). - gae_lambda (float): Lambda factor for generalized advantage estimation (default: none). - likelihood_ratio_clipping (float): Likelihood ratio clipping for policy gradient - (default: 0.2). - step_optimizer (spec): Step optimizer specification of implicit multi-step subsampling - optimizer, see core.optimizers module for more information (default: {type='adam', - learning_rate=1e-3}). - subsampling_fraction (float): Subsampling fraction of implicit subsampling optimizer - (default: 0.1). - optimization_steps (int): Number of optimization steps for implicit multi-step - optimizer (default: 50). - """ - - # Update mode - if update_mode is None: - update_mode = dict( - unit='episodes', - batch_size=10 - ) - elif 'unit' in update_mode: - assert update_mode['unit'] == 'episodes' - else: - update_mode['unit'] = 'episodes' - - # Memory - if memory is None: - # Assumed episode length of 1000 timesteps. - memory = dict( - type='latest', - include_next_states=False, - capacity=(1000 * update_mode['batch_size']) - ) - else: - assert not memory['include_next_states'] - - # Optimizer - if step_optimizer is None: - step_optimizer = dict( - type='adam', - learning_rate=1e-3 - ) - optimizer = dict( - type='multi_step', - optimizer=dict( - type='subsampling_step', - optimizer=step_optimizer, - fraction=subsampling_fraction - ), - num_steps=optimization_steps - ) - - self.baseline_mode = baseline_mode - self.baseline = baseline - self.baseline_optimizer = baseline_optimizer - self.gae_lambda = gae_lambda - self.likelihood_ratio_clipping = likelihood_ratio_clipping - - super(PPOAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization - ) - - def initialize_model(self): - return PGProbRatioModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - variable_noise=self.variable_noise, - states_preprocessing=self.states_preprocessing, - actions_exploration=self.actions_exploration, - reward_preprocessing=self.reward_preprocessing, - update_mode=self.update_mode, - memory=self.memory, - optimizer=self.optimizer, - discount=self.discount, - network=self.network, - distributions=self.distributions, - entropy_regularization=self.entropy_regularization, - baseline_mode=self.baseline_mode, - 
baseline=self.baseline, - baseline_optimizer=self.baseline_optimizer, - gae_lambda=self.gae_lambda, - likelihood_ratio_clipping=self.likelihood_ratio_clipping - ) diff --git a/tensorforce/agents/random.py b/tensorforce/agents/random.py new file mode 100644 index 000000000..e2143e985 --- /dev/null +++ b/tensorforce/agents/random.py @@ -0,0 +1,126 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce.agents import Agent +from tensorforce.core.models import RandomModel + + +class RandomAgent(Agent): + """ + Agent returning random action values (specification key: `random`). + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
            • type ("bool" | "int" | "float") – state data type (default: "float").
            • shape (int | iter[int]) – state shape (required).
            • num_values (int > 0) – number of discrete state values (required for type "int").
            • min_value/max_value (float) – minimum/maximum state value (optional for type "float").
            + actions (specification): Actions specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of + action descriptions (usually taken from `Environment.actions()`) with the following + attributes: +
            • type ("bool" | "int" | "float") – action data type (required).
            • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
            • num_values (int > 0) – number of discrete action values (required for type "int").
            • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
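            As a small sketch of such nested state/action specifications (the particular shapes, bounds and names below are made up for illustration, not taken from this changeset):

```python
# Hypothetical states/actions specifications following the attributes listed above.
states_spec = dict(type='float', shape=(8,), min_value=-1.0, max_value=1.0)
actions_spec = dict(
    move=dict(type='int', shape=(), num_values=4),
    force=dict(type='float', shape=(), min_value=0.0, max_value=1.0)
)
```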
+ max_episode_timesteps (int > 0): Upper bound for number of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + config (specification): Additional configuration options: +
            • name (string) – Agent name, used e.g. for TensorFlow scopes (default: "agent").
            • device (string) – Device name (default: TensorFlow default).
            • seed (int) – Random seed to set for Python, NumPy (both set globally!) and TensorFlow; the environment seed may have to be set separately for fully deterministic execution (default: none).
            • buffer_observe (false | "episode" | int > 0) – Number of timesteps within an episode to buffer before calling the internal observe function, to reduce calls to TensorFlow for improved performance (default: configuration-specific maximum number which can be buffered without affecting performance).
            • always_apply_exploration (bool) – Whether to always apply exploration, also for independent `act()` calls (final value in case of schedule) (default: false).
            • always_apply_variable_noise (bool) – Whether to always apply variable noise, also for independent `act()` calls (final value in case of schedule) (default: false).
            • enable_int_action_masking (bool) – Whether int action options can be masked via an optional "[ACTION-NAME]_mask" state input (default: true).
            • create_tf_assertions (bool) – Whether to create internal TensorFlow assertion operations (default: true).
            + recorder (path | specification): Traces recordings directory, or recorder configuration with + the following attributes (see + [record-and-pretrain script](https://github.com/tensorforce/tensorforce/blob/master/examples/record_and_pretrain.py) + for example application) + (default: no recorder): +
            • directory (path) – recorder directory (required).
            • frequency (int > 0) – how frequently in episodes to record traces (default: every episode).
            • start (int >= 0) – how many episodes to skip before starting to record traces (default: 0).
            • max-traces (int > 0) – maximum number of traces to keep (default: all).
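            For illustration, a recorder specification combining these attributes might be passed as in the sketch below; the chosen values and the Gym environment setup are assumptions rather than part of this changeset:

```python
# Hypothetical usage sketch: record every 10th episode after the first 100 episodes,
# keeping at most 5 trace files in the given directory.
from tensorforce import Agent, Environment

environment = Environment.create(environment='gym', level='CartPole-v1')
recorder_spec = {'directory': 'traces', 'frequency': 10, 'start': 100, 'max-traces': 5}
agent = Agent.create(agent='random', environment=environment, recorder=recorder_spec)
```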
            • + """ + + def __init__( + # Environment + self, states, actions, max_episode_timesteps=None, + # Config, recorder + config=None, recorder=None + ): + if not hasattr(self, 'spec'): + self.spec = OrderedDict( + agent='random', + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + config=config, recorder=recorder + ) + + super().__init__( + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=1, config=config, recorder=recorder + ) + + self.model = RandomModel( + states=self.states_spec, actions=self.actions_spec, + parallel_interactions=self.parallel_interactions, + config=self.config, summarizer=None, tracking=None + ) diff --git a/tensorforce/agents/random_agent.py b/tensorforce/agents/random_agent.py deleted file mode 100644 index 0a3f958b1..000000000 --- a/tensorforce/agents/random_agent.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import Agent -from tensorforce.models.random_model import RandomModel - - -class RandomAgent(Agent): - """ - Agent returning random action values. - """ - - def __init__( - self, - states, - actions, - batched_observe=True, - batching_capacity=1000, - scope='random', - device=None, - saver=None, - summarizer=None, - distributed=None, - ): - """ - Initializes the random agent. - - Args: - scope (str): TensorFlow scope (default: name of agent). - device: TensorFlow device (default: none) - saver (spec): Saver specification, with the following attributes (default: none): - - directory: model directory. - - file: model filename (optional). - - seconds or steps: save frequency (default: 600 seconds). - - load: specifies whether model is loaded, if existent (default: true). - - basename: optional file basename (default: 'model.ckpt'). - summarizer (spec): Summarizer specification, with the following attributes (default: - none): - - directory: summaries directory. - - seconds or steps: summarize frequency (default: 120 seconds). - - labels: list of summary labels to record (default: []). - - meta_param_recorder_class: ???. - distributed (spec): Distributed specification, with the following attributes (default: - none): - - cluster_spec: TensorFlow ClusterSpec object (required). - - task_index: integer (required). - - parameter_server: specifies whether this instance is a parameter server (default: - false). - - protocol: communication protocol (default: none, i.e. 'grpc'). - - config: TensorFlow ConfigProto object (default: none). - - replica_model: internal. 
- """ - - self.scope = scope - self.device = device - self.saver = saver - self.summarizer = summarizer - self.distributed = distributed - - super(RandomAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity - ) - - def initialize_model(self): - return RandomModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity - ) diff --git a/tensorforce/agents/recorder.py b/tensorforce/agents/recorder.py new file mode 100644 index 000000000..eb255f8fa --- /dev/null +++ b/tensorforce/agents/recorder.py @@ -0,0 +1,575 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +from collections import OrderedDict + +import numpy as np + +from tensorforce import TensorforceError, util +from tensorforce.core import ArrayDict, ListDict, TensorSpec, TensorsSpec + + +class Recorder(object): + """ + Recorder wrapper (specification key: `recorder`). + + Args: + fn_act (callable[states -> actions]): Act-function mapping states to actions which is + supposed to be recorded. 
+ """ + + def __init__( + self, fn_act, states, actions, max_episode_timesteps=None, parallel_interactions=1, + recorder=None + ): + self.is_initialized = False + + # fn_act=None means Agent + if fn_act is None: + from tensorforce import Agent + assert isinstance(self, Agent) + self._is_agent = True + else: + self._is_agent = False + self.fn_act = fn_act + + # States/actions, plus single state/action flag + if 'type' in states or 'shape' in states: + self.states_spec = TensorsSpec(singleton=states) + else: + self.states_spec = TensorsSpec(states) + if 'type' in actions or 'shape' in actions: + self.actions_spec = TensorsSpec(singleton=actions) + else: + self.actions_spec = TensorsSpec(actions) + + # Max episode timesteps + if max_episode_timesteps is None: + self.max_episode_timesteps = None + else: + self.max_episode_timesteps = int(max_episode_timesteps) + + # Parallel interactions + if isinstance(parallel_interactions, int): + if parallel_interactions <= 0: + raise TensorforceError.value( + name='Agent', argument='parallel_interactions', value=parallel_interactions, + hint='<= 0' + ) + self.parallel_interactions = parallel_interactions + else: + raise TensorforceError.type( + name='Agent', argument='parallel_interactions', dtype=type(parallel_interactions) + ) + + # Other specifications + self.internals_spec = TensorsSpec() + self.terminal_spec = TensorSpec(type=int, shape=(), num_values=3) + self.reward_spec = TensorSpec(type=float, shape=()) + self.parallel_spec = TensorSpec(type=int, shape=(), num_values=self.parallel_interactions) + + # Recorder + if isinstance(recorder, str): + recorder = dict(directory=recorder) + if recorder is None: + pass + elif not all(key in ('directory', 'frequency', 'max-traces', 'start') for key in recorder): + raise TensorforceError.value( + name='Agent', argument='recorder values', value=list(recorder), + hint='not from {directory,frequency,max-traces,start}' + ) + self.recorder = recorder if recorder is None else dict(recorder) + + def initialize(self): + # Check whether already initialized + if self.is_initialized: + raise TensorforceError( + message="Agent is already initialized, possibly as part of Agent.create()." 
+ ) + self.is_initialized = True + + # Act-observe timestep check + self.timestep_counter = np.zeros( + shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int') + ) + self.timestep_completed = np.ones( + shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='bool') + ) + + # Recorder buffers if required + if self.recorder is not None: + self.num_episodes = 0 + + self.buffers = ListDict() + self.buffers['terminal'] = [list() for _ in range(self.parallel_interactions)] + self.buffers['reward'] = [list() for _ in range(self.parallel_interactions)] + + def function(spec): + return [list() for _ in range(self.parallel_interactions)] + + self.buffers['states'] = self.states_spec.fmap(function=function, cls=ListDict) + self.buffers['actions'] = self.actions_spec.fmap(function=function, cls=ListDict) + + function = (lambda x: list()) + + self.recorded = ListDict() + self.recorded['states'] = self.states_spec.fmap(function=function, cls=ListDict) + self.recorded['actions'] = self.actions_spec.fmap(function=function, cls=ListDict) + self.recorded['terminal'] = list() + self.recorded['reward'] = list() + + def close(self): + pass + + def reset(self): + # Reset timestep check + self.timestep_counter[:] = 0 + self.timestep_completed[:] = True + + # Reset buffers + if self.recorder is not None: + for buffer in self.buffers.values(): + for x in buffer: + x.clear() + if self.recorder is not None: + for x in self.recorded.values(): + x.clear() + + def initial_internals(self): + return OrderedDict() + + def act( + self, states, internals=None, parallel=0, independent=False, deterministic=True, **kwargs + ): + # Independent and internals + is_internals_none = (internals is None) + if independent: + if parallel != 0: + raise TensorforceError.invalid( + name='Agent.act', argument='parallel', condition='independent is true' + ) + if is_internals_none and len(self.internals_spec) > 0: + raise TensorforceError.required( + name='Agent.act', argument='internals', condition='independent is true' + ) + else: + if not is_internals_none: + raise TensorforceError.invalid( + name='Agent.act', argument='internals', condition='independent is false' + ) + + # Process states input and infer batching structure + states, batched, num_parallel, is_iter_of_dicts = self._process_states_input( + states=states, function_name='Agent.act' + ) + + if independent: + # Independent mode: handle internals argument + if is_internals_none: + # Default input internals=None + pass + + elif is_iter_of_dicts or isinstance(internals, (tuple, list)): + # Input structure iter[dict[internal]] + if not isinstance(internals, (tuple, list)): + raise TensorforceError.type( + name='Agent.act', argument='internals', dtype=type(internals), + hint='is not tuple/list' + ) + internals = [ArrayDict(internal) for internal in internals] + internals = internals[0].fmap( + function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:] + ) + + else: + # Input structure dict[iter[internal]] + if not isinstance(internals, dict): + raise TensorforceError.type( + name='Agent.act', argument='internals', dtype=type(internals), + hint='is not dict' + ) + internals = ArrayDict(internals) + + if not independent or not is_internals_none: + # Expand inputs if not batched + if not batched: + internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0))) + + # Check number of inputs + for name, internal in internals.items(): + if internal.shape[0] != num_parallel: + raise TensorforceError.value( + name='Agent.act', 
argument='len(internals[{}])'.format(name), + value=internal.shape[0], hint='!= len(states)' + ) + + else: + # Non-independent mode: handle parallel input + if batched: + # Batched input + parallel = np.asarray(parallel) + + elif parallel == 0: + # Default input parallel=0 + if batched: + assert num_parallel == self.parallel_interactions + parallel = np.asarray(list(range(num_parallel))) + else: + parallel = np.asarray([parallel]) + + else: + # Expand input if not batched + parallel = np.asarray([parallel]) + + # Check number of inputs + if parallel.shape[0] != num_parallel: + raise TensorforceError.value( + name='Agent.act', argument='len(parallel)', value=len(parallel), + hint='!= len(states)' + ) + + # If not independent, check whether previous timesteps were completed + if not independent: + if not self.timestep_completed[parallel].all(): + raise TensorforceError( + message="Calling agent.act must be preceded by agent.observe for training, or " + "agent.act argument 'independent' must be passed as True." + ) + self.timestep_completed[parallel] = False + + # Buffer inputs for recording + if self.recorder is not None and not independent and \ + self.num_episodes >= self.recorder.get('start', 0): + for n in range(num_parallel): + for name in self.states_spec: + self.buffers['states'][name][parallel[n]].append(states[name][n]) + + # fn_act() + if self._is_agent: + actions, internals = self.fn_act( + states=states, internals=internals, parallel=parallel, independent=independent, + deterministic=deterministic, is_internals_none=is_internals_none, + num_parallel=num_parallel + ) + else: + if batched: + assert False + else: + states = states.fmap(function=(lambda x: x[0].item() if x.shape == (1,) else x[0])) + actions = self.fn_act(states.to_kwargs()) + if self.actions_spec.is_singleton(): + actions = ArrayDict(singleton=np.asarray([actions])) + else: + actions = ArrayDict(actions) + actions = actions.fmap(function=(lambda x: np.asarray([x]))) + + # Buffer outputs for recording + if self.recorder is not None and not independent and \ + self.num_episodes >= self.recorder.get('start', 0): + for n in range(num_parallel): + for name in self.actions_spec: + self.buffers['actions'][name][parallel[n]].append(actions[name][n]) + + # Unbatch actions + if batched: + # If inputs were batched, turn dict of lists into list of dicts + function = (lambda x: x.item() if x.shape == () else x) + # TODO: recursive + if self.actions_spec.is_singleton(): + actions = actions.singleton() + if is_iter_of_dicts: + actions = [function(actions[n]) for n in range(num_parallel)] + else: + if is_iter_of_dicts: + actions = [ + OrderedDict(((name, function(x[n])) for name, x in actions.items())) + for n in range(num_parallel) + ] + else: + actions = OrderedDict(actions.items()) + + if independent and not is_internals_none: + if is_iter_of_dicts: + # TODO: recursive + internals = [ + OrderedDict(((name, function(x[n])) for name, x in internals.items())) + for n in range(num_parallel) + ] + else: + internals = OrderedDict(internals.items()) + + else: + # If inputs were not batched, unbatch outputs + function = (lambda x: x.item() if x.shape == (1,) else x[0]) + if self.actions_spec.is_singleton(): + actions = function(actions.singleton()) + else: + actions = actions.fmap(function=function, cls=OrderedDict) + if independent and not is_internals_none: + internals = internals.fmap(function=function, cls=OrderedDict) + + if independent and not is_internals_none: + return actions, internals + else: + return actions + + def 
observe(self, reward=0.0, terminal=False, parallel=0): + # Check whether inputs are batched + if util.is_iterable(x=reward) or (isinstance(reward, np.ndarray) and reward.ndim > 0): + reward = np.asarray(reward) + num_parallel = reward.shape[0] + if not isinstance(terminal, np.ndarray) and terminal is False: + terminal = np.asarray([0 for _ in range(num_parallel)]) + else: + terminal = np.asarray(terminal) + if not isinstance(parallel, np.ndarray) and parallel == 0: + assert num_parallel == self.parallel_interactions + parallel = np.asarray(list(range(num_parallel))) + else: + parallel = np.asarray(parallel) + + elif util.is_iterable(x=terminal) or \ + (isinstance(terminal, np.ndarray) and terminal.ndim > 0): + terminal = np.asarray(terminal, dtype=util.np_dtype(dtype='int')) + num_parallel = terminal.shape[0] + if not isinstance(reward, np.ndarray) and reward == 0.0: + reward = np.asarray([0.0 for _ in range(num_parallel)]) + else: + reward = np.asarray(reward) + if not isinstance(parallel, np.ndarray) and parallel == 0: + assert num_parallel == self.parallel_interactions + parallel = np.asarray(list(range(num_parallel))) + else: + parallel = np.asarray(parallel) + + elif util.is_iterable(x=parallel) or \ + (isinstance(parallel, np.ndarray) and parallel.ndim > 0): + parallel = np.asarray(parallel) + num_parallel = parallel.shape[0] + if not isinstance(reward, np.ndarray) and reward == 0.0: + reward = np.asarray([0.0 for _ in range(num_parallel)]) + else: + reward = np.asarray(reward) + if not isinstance(terminal, np.ndarray) and terminal is False: + terminal = np.asarray([0 for _ in range(num_parallel)]) + else: + terminal = np.asarray(terminal) + + else: + reward = np.asarray([float(reward)]) + terminal = np.asarray([int(terminal)]) + parallel = np.asarray([int(parallel)]) + num_parallel = 1 + + # Check whether shapes/lengths are consistent + if parallel.shape[0] == 0: + raise TensorforceError.value( + name='Agent.observe', argument='len(parallel)', value=parallel.shape[0], hint='= 0' + ) + if reward.shape != parallel.shape: + raise TensorforceError.value( + name='Agent.observe', argument='len(reward)', value=reward.shape, + hint='!= parallel length' + ) + if terminal.shape != parallel.shape: + raise TensorforceError.value( + name='Agent.observe', argument='len(terminal)', value=terminal.shape, + hint='!= parallel length' + ) + + # Convert terminal to int if necessary + if terminal.dtype is util.np_dtype(dtype='bool'): + zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int')) + ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int')) + terminal = np.where(terminal, ones, zeros) + + # Check whether current timesteps are not completed + if self.timestep_completed[parallel].any(): + raise TensorforceError(message="Calling agent.observe must be preceded by agent.act.") + self.timestep_completed[parallel] = True + + # Check whether episode is too long + self.timestep_counter[parallel] += 1 + if self.max_episode_timesteps is not None and np.logical_and( + terminal == 0, self.timestep_counter[parallel] > self.max_episode_timesteps + ).any(): + raise TensorforceError(message="Episode longer than max_episode_timesteps.") + self.timestep_counter[parallel] = np.where(terminal > 0, 0, self.timestep_counter[parallel]) + + if self.recorder is None: + pass + + elif self.num_episodes < self.recorder.get('start', 0): + # Increment num_episodes + for t in terminal.tolist(): + if t > 0: + self.num_episodes += 1 + + else: + # Store values per parallel interaction + for p, t, r in 
zip(parallel.tolist(), terminal.tolist(), reward.tolist()): + + # Buffer inputs + self.buffers['terminal'][p].append(t) + self.buffers['reward'][p].append(r) + + # Continue if not terminal + if t == 0: + continue + self.num_episodes += 1 + + # Buffered terminal/reward inputs + for name in self.states_spec: + self.recorded['states'][name].append( + np.stack(self.buffers['states'][name][p], axis=0) + ) + self.buffers['states'][name][p].clear() + for name, spec in self.actions_spec.items(): + self.recorded['actions'][name].append( + np.stack(self.buffers['actions'][name][p], axis=0) + ) + self.buffers['actions'][name][p].clear() + self.recorded['terminal'].append( + np.array(self.buffers['terminal'][p], dtype=self.terminal_spec.np_type()) + ) + self.buffers['terminal'][p].clear() + self.recorded['reward'].append( + np.array(self.buffers['reward'][p], dtype=self.reward_spec.np_type()) + ) + self.buffers['reward'][p].clear() + + # Check whether recording step + if (self.num_episodes - self.recorder.get('start', 0)) \ + % self.recorder.get('frequency', 1) != 0: + continue + + # Manage recorder directory + directory = self.recorder['directory'] + if os.path.isdir(directory): + files = sorted( + f for f in os.listdir(directory) + if os.path.isfile(os.path.join(directory, f)) + and os.path.splitext(f)[1] == '.npz' + ) + else: + os.makedirs(directory) + files = list() + max_traces = self.recorder.get('max-traces') + if max_traces is not None and len(files) > max_traces - 1: + for filename in files[:-max_traces + 1]: + filename = os.path.join(directory, filename) + os.remove(filename) + + # Write recording file + filename = os.path.join(directory, 'trace-{:09d}.npz'.format(self.num_episodes - 1)) + # time.strftime('%Y%m%d-%H%M%S') + kwargs = self.recorded.fmap(function=np.concatenate, cls=ArrayDict).items() + np.savez_compressed(file=filename, **dict(kwargs)) + + # Clear recorded values + for recorded in self.recorded.values(): + recorded.clear() + + if self._is_agent: + return reward, terminal, parallel + else: + return 0 + + def _process_states_input(self, states, function_name): + if self.states_spec.is_singleton() and not isinstance(states, dict) and not ( + util.is_iterable(x=states) and isinstance(states[0], dict) + ): + # Single state + states = np.asarray(states) + if states.shape == self.states_spec.value().shape: + # Single state is not batched + states = ArrayDict(singleton=np.expand_dims(states, axis=0)) + batched = False + num_instances = 1 + is_iter_of_dicts = None + + else: + # Single state is batched, iter[state] + assert states.shape[1:] == self.states_spec.value().shape + assert type(states) in (tuple, list, np.ndarray) + num_instances = states.shape[0] + states = ArrayDict(singleton=states) + batched = True + is_iter_of_dicts = True # Default + + elif util.is_iterable(x=states): + # States is batched, iter[dict[state]] + batched = True + num_instances = len(states) + is_iter_of_dicts = True + assert type(states) in (tuple, list) + if num_instances == 0: + raise TensorforceError.value( + name=function_name, argument='len(states)', value=num_instances, hint='= 0' + ) + for n, state in enumerate(states): + if not isinstance(state, dict): + raise TensorforceError.type( + name=function_name, argument='states[{}]'.format(n), dtype=type(state), + hint='is not dict' + ) + # Turn iter of dicts into dict of arrays + # (Doesn't use self.states_spec since states also contains auxiliaries) + states = [ArrayDict(state) for state in states] + states = states[0].fmap( + function=(lambda *xs: 
np.stack(xs, axis=0)), zip_values=states[1:] + ) + + elif isinstance(states, dict): + # States is dict, turn into arrays + states = ArrayDict(states) + name, spec = self.states_spec.item() + if name is None: + name = 'state' + + if states[name].shape == spec.shape: + # States is not batched, dict[state] + states = states.fmap(function=(lambda state: np.expand_dims(state, axis=0))) + batched = False + num_instances = 1 + is_iter_of_dicts = None + + else: + # States is batched, dict[iter[state]] + assert states[name].shape[1:] == spec.shape + assert type(states[name]) in (tuple, list, np.ndarray) + batched = True + num_instances = states[name].shape[0] + is_iter_of_dicts = False + if num_instances == 0: + raise TensorforceError.value( + name=function_name, argument='len(states)', value=num_instances, hint='= 0' + ) + + else: + raise TensorforceError.type( + name=function_name, argument='states', dtype=type(states), + hint='is not array/tuple/list/dict' + ) + + # Check number of inputs + if any(state.shape[0] != num_instances for state in states.values()): + raise TensorforceError.value( + name=function_name, argument='len(states)', + value=[state.shape[0] for state in states.values()], hint='inconsistent' + ) + + return states, batched, num_instances, is_iter_of_dicts diff --git a/tensorforce/agents/tensorforce.py b/tensorforce/agents/tensorforce.py new file mode 100644 index 000000000..f38ed5110 --- /dev/null +++ b/tensorforce/agents/tensorforce.py @@ -0,0 +1,738 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict +import os +from random import shuffle + +import numpy as np + +from tensorforce import TensorforceError, util +from tensorforce.agents import Agent +from tensorforce.core import ArrayDict +from tensorforce.core.models import TensorforceModel + + +class TensorforceAgent(Agent): + """ + Tensorforce agent (specification key: `tensorforce`). + + Highly configurable agent and basis for a broad class of deep reinforcement learning agents, + which act according to a policy parametrized by a neural network, leverage a memory module for + periodic updates based on batches of experience, and optionally employ a baseline/critic/target + policy for improved reward estimation. + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create()`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
              • type ("bool" | "int" | "float") – state data type (default: "float").
              • shape (int | iter[int]) – state shape (required).
              • num_values (int > 0) – number of discrete state values (required for type "int").
              • min_value/max_value (float) – minimum/maximum state value (optional for type "float").
              + actions (specification): Actions specification + (required, better implicitly specified via + `environment` argument for `Agent.create()`), arbitrarily nested dictionary of + action descriptions (usually taken from `Environment.actions()`) with the following + attributes: +
              • type ("bool" | "int" | "float") – action data type (required).
              • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
              • num_values (int > 0) – number of discrete action values (required for type "int").
              • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
+ max_episode_timesteps (int > 0): Upper bound for number of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create()`). + + policy (specification): Policy configuration, see [networks](../modules/networks.html) and + [policies documentation](../modules/policies.html) + (default: action distributions or value + functions parametrized by an automatically configured network). + memory (int | specification): Replay memory capacity, or memory configuration, see the + [memories documentation](../modules/memories.html) + (default: minimum capacity recent memory). + update (int | specification): Model update configuration with the following attributes + (required, + default: timesteps batch size): +
              • unit ("timesteps" | "episodes") – unit for update attributes (required).
              • batch_size (parameter, int > 0) – size of update batch in number of units (required).
              • frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0) – frequency of updates, relative to batch_size if float (default: batch_size).
              • start (parameter, int >= batch_size) – number of units before first update (default: none).
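              To make the batch_size/frequency interaction concrete, a possible update specification could look like the following sketch (values are illustrative, not taken from this diff):

```python
# Hypothetical update specification: batches of 64 timesteps, an update every
# 32 timesteps (frequency 0.5 relative to batch_size), first update after 256 timesteps.
update_spec = dict(unit='timesteps', batch_size=64, frequency=0.5, start=256)
```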
              + optimizer (specification): Optimizer configuration, see the + [optimizers documentation](../modules/optimizers.html) + (default: Adam optimizer). + objective (specification): Optimization objective configuration, see the + [objectives documentation](../modules/objectives.html) + (required). + reward_estimation (specification): Reward estimation configuration with the following + attributes (required): +
              • horizon ("episode" | parameter, int >= 1) – Horizon of discounted-sum return estimation (required).
              • discount (parameter, 0.0 <= float <= 1.0) – Discount factor of future rewards for discounted-sum return estimation (default: 1.0).
              • predict_horizon_values (false | "early" | "late") – Whether to include a baseline prediction of the horizon value as part of the return estimation, and if so, whether to compute the horizon value prediction "early" when experiences are stored to memory, or "late" when batches of experience are retrieved for the update (default: "late" if baseline_policy or baseline_objective are specified, else false).
              • estimate_advantage (false | "early" | "late") – Whether to use an estimate of the advantage (return minus baseline value prediction) instead of the return as learning signal, and whether to do so late after the baseline update (default) or early before the baseline update (default: false, unless baseline_policy is specified but baseline_objective/optimizer are not).
              • predict_action_values (bool) – Whether to predict state-action- instead of state-values as horizon values and for advantage estimation (default: false).
              • reward_processing (specification) – Reward preprocessing as layer or list of layers, see the [preprocessing documentation](../modules/preprocessing.html) (default: no reward processing).
              • return_processing (specification) – Return processing as layer or list of layers, see the [preprocessing documentation](../modules/preprocessing.html) (default: no return processing).
              • advantage_processing (specification) – Advantage processing as layer or list of layers, see the [preprocessing documentation](../modules/preprocessing.html) (default: no advantage processing).
              • predict_terminal_values (bool) – Whether to predict the value of terminal states, usually not required since max_episode_timesteps terminals are handled separately (default: false).
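              As a sketch of how a few of these attributes might be combined (the particular values are illustrative only):

```python
# Hypothetical reward_estimation specification: 20-timestep horizon, discount 0.99,
# late advantage estimation, state-values (not state-action-values) as horizon values.
reward_estimation_spec = dict(
    horizon=20, discount=0.99, estimate_advantage='late', predict_action_values=False
)
```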
              + + baseline (specification): Baseline configuration, policy will be used as baseline if none, + see [networks](../modules/networks.html) and potentially + [policies documentation](../modules/policies.html) + (default: none). + baseline_optimizer (specification | parameter, float > 0.0): + Baseline optimizer configuration, see the + [optimizers documentation](../modules/optimizers.html), + main optimizer will be used for baseline if none, a float implies none and specifies a + custom weight for the baseline loss + (default: none). + baseline_objective (specification): Baseline optimization objective configuration, see the + [objectives documentation](../modules/objectives.html), + required if baseline optimizer is specified, main objective will be used for baseline if + baseline objective and optimizer are not specified + (default: none). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). + exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise). + + parallel_interactions (int > 0): Maximum number of parallel interactions to support, + for instance, to enable multiple parallel episodes, environments or agents within an + environment + (default: 1). + config (specification): Additional configuration options: +
              • name (string) – Agent name, used e.g. for TensorFlow scopes and saver default filename (default: "agent").
              • device (string) – Device name (default: CPU). Unlike (un)supervised deep learning, RL does not always benefit from running on a GPU, depending on environment and agent configuration. In particular, for RL-typical environments with low-dimensional state spaces (i.e., no images), one usually gets better performance by running on CPU only. Consequently, Tensorforce is configured to run on CPU by default, which can be changed, for instance, by setting this value to 'GPU' instead.
              • seed (int) – Random seed to set for Python, NumPy (both set globally!) and TensorFlow; the environment seed may have to be set separately for fully deterministic execution. Generally not recommended, since results in a fully deterministic setting are less meaningful/representative (default: none).
              • buffer_observe (false | "episode" | int > 0) – Number of timesteps within an episode to buffer before calling the internal observe function, to reduce calls to TensorFlow for improved performance (default: configuration-specific maximum number which can be buffered without affecting performance).
              • enable_int_action_masking (bool) – Whether int action options can be masked via an optional "[ACTION-NAME]_mask" state input (default: true).
              • create_tf_assertions (bool) – Whether to create internal TensorFlow assertion operations (default: true).
              • eager_mode (bool) – Whether to run functions eagerly instead of running as a traced graph function, can be helpful for debugging (default: false).
              • tf_log_level (int >= 0) – TensorFlow log level; additional C++ logging messages can be enabled by setting os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"/"2" before importing Tensorforce/TensorFlow (default: 40, only error and critical).
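              For instance, a config specification restricted to the options above might look like this sketch (the values are assumptions, not defaults from this diff):

```python
# Hypothetical config: named agent, fixed seed for reproducibility, default graph mode,
# and TensorFlow log level 40 (errors only).
config_spec = dict(name='agent', seed=42, eager_mode=False, tf_log_level=40)
```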
              + saver (path | specification): TensorFlow checkpoints directory, or checkpoint manager + configuration with the following attributes, for periodic implicit saving as alternative + to explicit saving via agent.save() + (default: no saver): +
              • directory (path) – checkpoint directory (required).
              • filename (string) – checkpoint filename (default: agent name).
              • frequency (int > 0) – how frequently to save a checkpoint (required).
              • unit ("timesteps" | "episodes" | "updates") – frequency unit (default: updates).
              • max_checkpoints (int > 0) – maximum number of checkpoints to keep (default: 10).
              • max_hour_frequency (int > 0) – regardless of max_checkpoints, keep at least one checkpoint per given number of hours (default: none).
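              A possible saver specification using these attributes, as a sketch with made-up values:

```python
# Hypothetical saver specification: checkpoint every 100 updates,
# keeping only the 5 most recent checkpoints.
saver_spec = dict(
    directory='checkpoints', filename='agent', frequency=100, unit='updates', max_checkpoints=5
)
```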
              + summarizer (path | specification): TensorBoard summaries directory, or summarizer + configuration with the following attributes + (default: no summarizer): +
              • directory (path) – summarizer directory (required).
              • filename (path) – summarizer filename, max_summaries does not apply if name specified (default: "summary-%Y%m%d-%H%M%S").
              • max_summaries (int > 0) – maximum number of (generically-named) summaries to keep (default: 7, number of different colors in Tensorboard).
              • flush (int > 0) – how frequently in seconds to flush the summary writer (default: 10).
              • summaries ("all" | iter[string]) – which summaries to record, "all" implies all numerical summaries, so all summaries except "graph" (default: "all"):
                  • "action-value": value of each action (timestep-based)
                  • "distribution": distribution parameters like probabilities or mean and stddev (timestep-based)
                  • "entropy": entropy of (per-action) policy distribution(s) (timestep-based)
                  • "graph": computation graph
                  • "kl-divergence": KL-divergence of previous and updated (per-action) policy distribution(s) (update-based)
                  • "loss": policy and baseline loss plus loss components (update-based)
                  • "parameters": parameter values (according to parameter unit)
                  • "reward": reward per timestep, episode length and reward, plus intermediate reward/return/advantage estimates and processed values (timestep/episode/update-based)
                  • "update-norm": global norm of update (update-based)
                  • "updates": mean and variance of update tensors per variable (update-based)
                  • "variables": mean of trainable variables tensors (update-based)
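              A summarizer specification restricted to a few of the summaries listed above might, for example, look like the following sketch (directory name and values are assumptions):

```python
# Hypothetical summarizer specification: record only loss, reward and entropy summaries,
# flushing the summary writer every 20 seconds.
summarizer_spec = dict(
    directory='summaries', summaries=['loss', 'reward', 'entropy'], flush=20
)
```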
              + tracking ("all" | iter[string]): Which tensors to track, available values are a subset of + the values of summarizer[summaries] above + (default: no tracking). + The current value of tracked tensors can be retrieved via tracked_tensors() at any time, + however, note that tensor values change at different timescales (timesteps, episodes, + updates). + recorder (path | specification): Traces recordings directory, or recorder configuration with + the following attributes (see + [record-and-pretrain script](https://github.com/tensorforce/tensorforce/blob/master/examples/record_and_pretrain.py) + for example application) + (default: no recorder): +
              • directory (path) – recorder directory (required).
              • frequency (int > 0) – how frequently in episodes to record traces (default: every episode).
              • start (int >= 0) – how many episodes to skip before starting to record traces (default: 0).
              • max-traces (int > 0) – maximum number of traces to keep (default: all).
              • + """ + + def __init__( + # Required + self, states, actions, update, optimizer, objective, reward_estimation, + # Environment + max_episode_timesteps=None, + # Agent + policy='auto', memory=None, + # Baseline + baseline=None, baseline_optimizer=None, baseline_objective=None, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_actions' in reward_estimation: + raise TensorforceError.deprecated( + name='Agent', argument='reward_estimation[estimate_actions]', + replacement='reward_estimation[predict_action_values]' + ) + if 'estimate_terminal' in reward_estimation: + raise TensorforceError.deprecated( + name='Agent', argument='reward_estimation[estimate_terminal]', + replacement='reward_estimation[predict_terminal_values]' + ) + if summarizer is not None and 'labels' in summarizer: + raise TensorforceError.deprecated( + name='Agent', argument='summarizer[labels]', replacement='summarizer[summaries]' + ) + if 'baseline_policy' in kwargs: + raise TensorforceError.deprecated( + name='Agent', argument='baseline_policy', replacement='baseline' + ) + if 'reward_preprocessing' in kwargs: + raise TensorforceError.deprecated( + name='Agent', argument='reward_preprocessing', + replacement='reward_estimation[reward_processing]' + ) + if 'name' in kwargs: + raise TensorforceError.deprecated( + name='Agent', argument='name', replacement='config[name]' + ) + if 'buffer_observe' in kwargs: + raise TensorforceError.deprecated( + name='Agent', argument='buffer_observe', replacement='config[buffer_observe]' + ) + if 'device' in kwargs: + raise TensorforceError.deprecated( + name='Agent', argument='device', replacement='config[device]' + ) + if 'seed' in kwargs: + raise TensorforceError.deprecated( + name='Agent', argument='seed', replacement='config[seed]' + ) + if len(kwargs) > 0: + raise TensorforceError.invalid(name='Agent', argument=', '.join(kwargs)) + + if not hasattr(self, 'spec'): + self.spec = OrderedDict( + agent='tensorforce', + # Environment + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + # Agent + policy=policy, memory=memory, update=update, optimizer=optimizer, + objective=objective, reward_estimation=reward_estimation, + # Baseline + baseline=baseline, baseline_optimizer=baseline_optimizer, + baseline_objective=baseline_objective, + # Regularization + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + # Preprocessing + state_preprocessing=state_preprocessing, + # Exploration + exploration=exploration, variable_noise=variable_noise, + # Parallel interactions + parallel_interactions=parallel_interactions, + # Config, saver, summarizer, recorder + config=config, saver=saver, summarizer=summarizer, tracking=tracking, + recorder=recorder + ) + + if memory is None: + memory = dict(type='recent') + + if isinstance(update, int): + update = dict(unit='timesteps', batch_size=update) + + if config is None: + config = dict() + else: + config = dict(config) + + # TODO: should this change if summarizer is specified? 
+ if parallel_interactions > 1: + if 'buffer_observe' not in config: + if max_episode_timesteps is None: + raise TensorforceError.required( + name='Agent', argument='max_episode_timesteps', + condition='parallel_interactions > 1' + ) + config['buffer_observe'] = 'episode' + # elif config['buffer_observe'] < max_episode_timesteps: + # raise TensorforceError.value( + # name='Agent', argument='config[buffer_observe]', + # hint='< max_episode_timesteps', condition='parallel_interactions > 1' + # ) + + elif update['unit'] == 'timesteps': + update_frequency = update.get('frequency', update['batch_size']) + if 'buffer_observe' not in config: + if isinstance(update_frequency, int): + config['buffer_observe'] = update_frequency + else: + config['buffer_observe'] = 1 + elif isinstance(update_frequency, int) and ( + config['buffer_observe'] == 'episode' or config['buffer_observe'] > update_frequency + ): + raise TensorforceError.value( + name='Agent', argument='config[buffer_observe]', value=config['buffer_observe'], + hint='> update[frequency]', condition='update[unit] = "timesteps"' + ) + + elif update['unit'] == 'episodes': + if 'buffer_observe' not in config: + config['buffer_observe'] = 'episode' + + # reward_estimation = dict(reward_estimation) + # if reward_estimation['horizon'] == 'episode': + # if max_episode_timesteps is None: + # raise TensorforceError.required( + # name='Agent', argument='max_episode_timesteps', + # condition='reward_estimation[horizon] = "episode"' + # ) + # reward_estimation['horizon'] = max_episode_timesteps + + super().__init__( + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder + ) + + self.model = TensorforceModel( + states=self.states_spec, actions=self.actions_spec, + max_episode_timesteps=self.max_episode_timesteps, + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=baseline_optimizer, + baseline_objective=baseline_objective, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + parallel_interactions=self.parallel_interactions, + config=self.config, saver=saver, summarizer=summarizer, tracking=tracking + ) + + def experience(self, states, actions, terminal, reward, internals=None): + """ + Feed experience traces. + + See the [act-experience-update script](https://github.com/tensorforce/tensorforce/blob/master/examples/act_experience_update_interface.py) + for an example application as part of the act-experience-update interface, which is an + alternative to the act-observe interaction pattern. + + Args: + states (dict[array[state]]): Dictionary containing arrays of states + (required). + actions (dict[array[action]]): Dictionary containing arrays of actions + (required). + terminal (array[bool]): Array of terminals + (required). + reward (array[float]): Array of rewards + (required). + internals (dict[state]): Dictionary containing arrays of internal agent states + (required if agent has internal states). 
+ """ + if not all(len(buffer) == 0 for buffer in self.terminal_buffer): + raise TensorforceError(message="Calling agent.experience is not possible mid-episode.") + + # Process states input and infer batching structure + states, batched, num_instances, is_iter_of_dicts = self._process_states_input( + states=states, function_name='Agent.experience' + ) + + if is_iter_of_dicts: + # Input structure iter[dict[input]] + + # Internals + if internals is None: + internals = ArrayDict(self.initial_internals()) + internals = internals.fmap(function=(lambda x: np.repeat(np.expand_dims(x, axis=0), repeats=num_instances, axis=0))) + elif not isinstance(internals, (tuple, list)): + raise TensorforceError.type( + name='Agent.experience', argument='internals', dtype=type(internals), + hint='is not tuple/list' + ) + else: + internals = [ArrayDict(internal) for internal in internals] + internals = internals[0].fmap( + function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:] + ) + + # Actions + if isinstance(actions, np.ndarray): + actions = ArrayDict(singleton=actions) + elif not isinstance(actions, (tuple, list)): + raise TensorforceError.type( + name='Agent.experience', argument='actions', dtype=type(actions), + hint='is not tuple/list' + ) + elif not isinstance(actions[0], dict): + actions = ArrayDict(singleton=np.asarray(actions)) + elif all(list(action) == ['action'] for action in actions): + actions = [ArrayDict(singleton=action['action']) for action in actions] + actions = actions[0].fmap( + function=(lambda *xs: np.stack(xs, axis=0)), zip_values=actions[1:] + ) + else: + actions = [ArrayDict(action) for action in actions] + actions = actions[0].fmap( + function=(lambda *xs: np.stack(xs, axis=0)), zip_values=actions[1:] + ) + + else: + # Input structure dict[iter[input]] + + # Internals + if internals is None: + internals = ArrayDict(self.initial_internals()) + internals = internals.fmap(function=(lambda x: np.tile(np.expand_dims(x, axis=0), reps=(num_instances,)))) + elif not isinstance(internals, dict): + raise TensorforceError.type( + name='Agent.experience', argument='internals', dtype=type(internals), + hint='is not dict' + ) + else: + internals = ArrayDict(internals) + + # Actions + if isinstance(actions, np.ndarray): + actions = ArrayDict(singleton=actions) + elif not isinstance(actions, dict): + raise TensorforceError.type( + name='Agent.experience', argument='actions', dtype=type(actions), + hint='is not dict' + ) + elif list(actions) == ['action']: + actions = ArrayDict(singleton=actions['action']) + else: + actions = ArrayDict(actions) + + # Expand inputs if not batched + if not batched: + internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0))) + actions = actions.fmap(function=(lambda x: np.expand_dims(x, axis=0))) + terminal = np.asarray([terminal]) + reward = np.asarray([reward]) + else: + terminal = np.asarray(terminal) + reward = np.asarray(reward) + + # Check number of inputs + for name, internal in internals.items(): + if internal.shape[0] != num_instances: + raise TensorforceError.value( + name='Agent.experience', argument='len(internals[{}])'.format(name), + value=internal.shape[0], hint='!= len(states)' + ) + for name, action in actions.items(): + if action.shape[0] != num_instances: + raise TensorforceError.value( + name='Agent.experience', argument='len(actions[{}])'.format(name), + value=action.shape[0], hint='!= len(states)' + ) + if terminal.shape[0] != num_instances: + raise TensorforceError.value( + name='Agent.experience', 
argument='len(terminal)'.format(name), + value=terminal.shape[0], hint='!= len(states)' + ) + if reward.shape[0] != num_instances: + raise TensorforceError.value( + name='Agent.experience', argument='len(reward)'.format(name), + value=reward.shape[0], hint='!= len(states)' + ) + + def function(name, spec): + auxiliary = ArrayDict() + if self.config.enable_int_action_masking and spec.type == 'int' and \ + spec.num_values is not None: + if name is None: + name = 'action' + # Mask, either part of states or default all true + auxiliary['mask'] = states.pop(name + '_mask', np.ones( + shape=(num_instances,) + spec.shape + (spec.num_values,), dtype=spec.np_type() + )) + return auxiliary + + auxiliaries = self.actions_spec.fmap(function=function, cls=ArrayDict, with_names=True) + if self.states_spec.is_singleton() and not states.is_singleton(): + states[None] = states.pop('state') + + # Convert terminal to int if necessary + if terminal.dtype is util.np_dtype(dtype='bool'): + zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int')) + ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int')) + terminal = np.where(terminal, ones, zeros) + + if terminal[-1] == 0: + raise TensorforceError(message="Agent.experience() requires full episodes as input.") + + # Batch experiences split into episodes and at most size buffer_observe + last = 0 + for index in range(1, len(terminal) + 1): + if terminal[index - 1] == 0: + continue + + function = (lambda x: x[last: index]) + states_batch = states.fmap(function=function) + internals_batch = internals.fmap(function=function) + auxiliaries_batch = auxiliaries.fmap(function=function) + actions_batch = actions.fmap(function=function) + terminal_batch = function(terminal) + reward_batch = function(reward) + last = index + + # Inputs to tensors + states_batch = self.states_spec.to_tensor( + value=states_batch, batched=True, name='Agent.experience states' + ) + internals_batch = self.internals_spec.to_tensor( + value=internals_batch, batched=True, recover_empty=True, + name='Agent.experience internals' + ) + auxiliaries_batch = self.auxiliaries_spec.to_tensor( + value=auxiliaries_batch, batched=True, name='Agent.experience auxiliaries' + ) + actions_batch = self.actions_spec.to_tensor( + value=actions_batch, batched=True, name='Agent.experience actions' + ) + terminal_batch = self.terminal_spec.to_tensor( + value=terminal_batch, batched=True, name='Agent.experience terminal' + ) + reward_batch = self.reward_spec.to_tensor( + value=reward_batch, batched=True, name='Agent.experience reward' + ) + + # Model.experience() + timesteps, episodes = self.model.experience( + states=states_batch, internals=internals_batch, auxiliaries=auxiliaries_batch, + actions=actions_batch, terminal=terminal_batch, reward=reward_batch + ) + self.timesteps = timesteps.numpy().item() + self.episodes = episodes.numpy().item() + + if self.model.saver is not None: + self.model.save() + + def update(self, query=None, **kwargs): + """ + Perform an update. + + See the [act-experience-update script](https://github.com/tensorforce/tensorforce/blob/master/examples/act_experience_update_interface.py) + for an example application as part of the act-experience-update interface, which is an + alternative to the act-observe interaction pattern. 
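To show how `act()`, `experience()` and `update()` combine, a hedged sketch modeled on the linked example script; the Gym level, the episode count, and the batch size of 1 (chosen so that every recorded episode forms a complete update batch) are illustrative assumptions.

```python
from tensorforce import Agent, Environment

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)
agent = Agent.create(agent='ppo', environment=environment, batch_size=1)

for _ in range(100):
    episode_states, episode_actions, episode_terminal, episode_reward = [], [], [], []
    states = environment.reset()
    terminal = False
    while not terminal:
        episode_states.append(states)
        # Independent act: does not write to the agent's observe buffers.
        actions = agent.act(states=states, independent=True)
        episode_actions.append(actions)
        states, terminal, reward = environment.execute(actions=actions)
        episode_terminal.append(terminal)
        episode_reward.append(reward)
    # Feed the collected episode, then trigger an update.
    agent.experience(
        states=episode_states, actions=episode_actions,
        terminal=episode_terminal, reward=episode_reward
    )
    agent.update()

agent.close()
environment.close()
```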
+ """ + updates = self.model.update() + self.updates = updates.numpy().item() + + if self.model.saver is not None: + self.model.save() + + def pretrain(self, directory, num_iterations, num_traces=1, num_updates=1, extension='.npz'): + """ + Simple pretraining approach as a combination of `experience()` and `update`, akin to + behavioral cloning, using experience traces obtained e.g. via recording agent interactions + ([see documentation](https://tensorforce.readthedocs.io/en/latest/basics/features.html#record-pretrain)). + + For the given number of iterations, load the given number of trace files (which each contain + recorder[frequency] episodes), feed the experience to the agent's internal memory, and + subsequently trigger the given number of updates (which will use the experience in the + internal memory, fed in this or potentially previous iterations). + + See the [record-and-pretrain script](https://github.com/tensorforce/tensorforce/blob/master/examples/record_and_pretrain.py) + for an example application. + + Args: + directory (path): Directory with experience traces, e.g. obtained via recorder; episode + length has to be consistent with agent configuration + (required). + num_iterations (int > 0): Number of iterations consisting of loading new traces and + performing multiple updates + (required). + num_traces (int > 0): Number of traces to load per iteration; has to at least satisfy + the update batch size + (default: 1). + num_updates (int > 0): Number of updates per iteration + (default: 1). + extension (str): Traces file extension to filter the given directory for + (default: ".npz"). + """ + if not os.path.isdir(directory): + raise TensorforceError.value( + name='agent.pretrain', argument='directory', value=directory + ) + files = sorted( + os.path.join(directory, f) for f in os.listdir(directory) + if os.path.isfile(os.path.join(directory, f)) and os.path.splitext(f)[1] == extension + ) + indices = list(range(len(files))) + + for _ in range(num_iterations): + shuffle(indices) + if num_traces is None: + selection = indices + else: + selection = indices[:num_traces] + + batch = None + for index in selection: + trace = ArrayDict(np.load(files[index])) + if batch is None: + batch = trace + else: + batch = batch.fmap( + function=(lambda x, y: np.concatenate([x, y], axis=0)), zip_values=(trace,) + ) + + for name, value in batch.pop('auxiliaries', dict()).items(): + assert name.endswith('/mask') + batch['states'][name[:-5] + '_mask'] = value + + self.experience(**batch.to_kwargs()) + for _ in range(num_updates): + self.update() + # TODO: self.obliviate() diff --git a/tensorforce/agents/trpo.py b/tensorforce/agents/trpo.py new file mode 100644 index 000000000..78ac36c68 --- /dev/null +++ b/tensorforce/agents/trpo.py @@ -0,0 +1,265 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class TrustRegionPolicyOptimization(TensorforceAgent): + """ + [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477) agent (specification key: + `trpo`). + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
                • type ("bool" | "int" | "float") – state data type (default: "float").
                • shape (int | iter[int]) – state shape (required).
                • num_values (int > 0) – number of discrete state values (required for type "int").
                • min_value/max_value (float) – minimum/maximum state value (optional for type "float").
                + actions (specification): Actions specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of + action descriptions (usually taken from `Environment.actions()`) with the following + attributes: +
                • type ("bool" | "int" | "float") – action data type (required).
                • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
                • num_values (int > 0) – number of discrete action values (required for type "int").
                • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
                + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + batch_size (parameter, int > 0): Number of episodes + per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous + actions by default. + (default: false). + + memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + 1 episodes + (default: minimum capacity, usually does not + need to be changed). + + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: batch_size). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-2). + linesearch_iterations (parameter, int >= 0): + Maximum number of line search iterations + (default: 10). + subsampling_fraction (parameter, int > 0 | 0.0 < float <= 1.0): + Absolute/relative fraction of batch timesteps to subsample for computation of natural + gradient update + (default: no subsampling). + + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + advantage_processing (specification): Advantage processing as layer or list of layers, see + the [preprocessing documentation](../modules/preprocessing.html) + (default: no advantage processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + baseline (specification): Baseline network configuration, see the + [networks documentation](../modules/networks.html), + main policy will be used as baseline if none + (default: none). + baseline_optimizer (float > 0.0 | specification): Baseline optimizer configuration, see the + [optimizers documentation](../modules/optimizers.html), main optimizer will be used for + baseline if none, a float implies none and specifies a custom weight for the baseline + loss + (default: none). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). 
+ exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise).
                + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Required + self, states, actions, max_episode_timesteps, batch_size, + # Network + network='auto', use_beta_distribution=False, + # Memory + memory='minimum', + # Optimization + update_frequency=1.0, learning_rate=1e-2, linesearch_iterations=10, + subsampling_fraction=1.0, + # Reward estimation + discount=0.99, reward_processing=None, return_processing=None, advantage_processing=None, + predict_terminal_values=False, + # Baseline + baseline=None, baseline_optimizer=None, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='TRPO', argument='estimate_terminal', replacement='predict_terminal_values' + ) + if 'critic_network' in kwargs: + raise TensorforceError.deprecated( + name='TRPO', argument='critic_network', replacement='baseline' + ) + if 'baseline_network' in kwargs: + raise TensorforceError.deprecated( + name='TRPO', argument='baseline_network', replacement='baseline' + ) + if 'critic_optimizer' in kwargs: + raise TensorforceError.deprecated( + name='TRPO', argument='critic_optimizer', replacement='baseline_optimizer' + ) + + self.spec = OrderedDict( + agent='trpo', + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + batch_size=batch_size, + network=network, use_beta_distribution=use_beta_distribution, + memory=memory, + update_frequency=update_frequency, learning_rate=learning_rate, + discount=discount, return_processing=return_processing, + advantage_processing=advantage_processing, + predict_terminal_values=predict_terminal_values, + baseline=baseline, baseline_optimizer=baseline_optimizer, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict( + type='parametrized_distributions', network=network, temperature=1.0, + use_beta_distribution=use_beta_distribution + ) + + if memory == 'minimum': + memory = dict(type='recent') + else: + memory = dict(type='recent', capacity=memory) + + update = dict(unit='episodes', batch_size=batch_size, frequency=update_frequency) + + optimizer = dict( + optimizer='natural_gradient', learning_rate=learning_rate, only_positive_updates=True, + subsampling_fraction=subsampling_fraction, linesearch_iterations=linesearch_iterations + ) + objective = dict(type='policy_gradient', importance_sampling=True) + + if baseline is None: + assert not predict_terminal_values + reward_estimation = dict( + horizon='episode', discount=discount, predict_horizon_values=False, + estimate_advantage=False, reward_processing=reward_processing, + return_processing=return_processing + ) + assert baseline_optimizer is None 
+ baseline_objective = None + + else: + reward_estimation = dict( + horizon='episode', discount=discount, predict_horizon_values='early', + estimate_advantage=True, predict_action_values=False, + reward_processing=reward_processing, return_processing=return_processing, + advantage_processing=advantage_processing, + predict_terminal_values=predict_terminal_values + ) + baseline = dict(type='parametrized_state_value', network=baseline) + assert baseline_optimizer is not None + baseline_objective = dict(type='state_value') + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=baseline_optimizer, + baseline_objective=baseline_objective, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) diff --git a/tensorforce/agents/trpo_agent.py b/tensorforce/agents/trpo_agent.py deleted file mode 100755 index d5b151c09..000000000 --- a/tensorforce/agents/trpo_agent.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import LearningAgent -from tensorforce.models import PGProbRatioModel - - -class TRPOAgent(LearningAgent): - """ - Trust Region Policy Optimization agent - ([Schulman et al., 2015](https://arxiv.org/abs/1502.05477)). - """ - - def __init__( - self, - states, - actions, - network, - batched_observe=True, - batching_capacity=1000, - scope='trpo', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - update_mode=None, - memory=None, - discount=0.99, - distributions=None, - entropy_regularization=None, - baseline_mode=None, - baseline=None, - baseline_optimizer=None, - gae_lambda=None, - likelihood_ratio_clipping=None, - learning_rate=1e-3, - cg_max_iterations=20, - cg_damping=1e-3, - cg_unroll_loop=False, - ls_max_iterations=10, - ls_accept_ratio=0.9, - ls_unroll_loop=False - ): - """ - Initializes the TRPO agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes: - - unit: 'episodes' if given (default: 'episodes'). - - batch_size: integer (default: 10). - - frequency: integer (default: batch_size). 
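As a usage illustration for the new `trpo` specification key defined above (hedged: the Gym level and all hyperparameter values are placeholder assumptions, not recommendations from this diff):

```python
from tensorforce import Agent, Environment, Runner

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)
# batch_size is the number of episodes per natural-gradient update.
agent = Agent.create(
    agent='trpo', environment=environment, batch_size=10,
    learning_rate=1e-2, subsampling_fraction=0.33, linesearch_iterations=10
)

runner = Runner(agent=agent, environment=environment)
runner.run(num_episodes=200)
runner.close()
agent.close()
environment.close()
```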
- memory (spec): Memory specification, see core.memories module for more information - (default: {type='latest', include_next_states=false, capacity=1000*batch_size}). - optimizer (spec): TRPO agent implicitly defines a optimized-step natural-gradient - optimizer. - baseline_mode (str): One of 'states', 'network' (default: none). - baseline (spec): Baseline specification, see core.baselines module for more information - (default: none). - baseline_optimizer (spec): Baseline optimizer specification, see core.optimizers module - for more information (default: none). - gae_lambda (float): Lambda factor for generalized advantage estimation (default: none). - likelihood_ratio_clipping (float): Likelihood ratio clipping for policy gradient - (default: none). - learning_rate (float): Learning rate of natural-gradient optimizer (default: 1e-3). - cg_max_iterations (int): Conjugate-gradient max iterations (default: 20). - cg_damping (float): Conjugate-gradient damping (default: 1e-3). - cg_unroll_loop (bool): Conjugate-gradient unroll loop (default: false). - ls_max_iterations (int): Line-search max iterations (default: 10). - ls_accept_ratio (float): Line-search accept ratio (default: 0.9). - ls_unroll_loop (bool): Line-search unroll loop (default: false). - """ - - # Update mode - if update_mode is None: - update_mode = dict( - unit='episodes', - batch_size=10 - ) - elif 'unit' in update_mode: - assert update_mode['unit'] == 'episodes' - else: - update_mode['unit'] = 'episodes' - - # Memory - if memory is None: - # Assumed episode length of 1000 timesteps. - memory = dict( - type='latest', - include_next_states=False, - capacity=(1000 * update_mode['batch_size']) - ) - else: - assert not memory['include_next_states'] - - # Optimizer - optimizer = dict( - type='optimized_step', - optimizer=dict( - type='natural_gradient', - learning_rate=learning_rate, - cg_max_iterations=cg_max_iterations, - cg_damping=cg_damping, - cg_unroll_loop=cg_unroll_loop, - ), - ls_max_iterations=ls_max_iterations, - ls_accept_ratio=ls_accept_ratio, - ls_mode='exponential', # !!!!!!!!!!!!! - ls_parameter=0.5, # !!!!!!!!!!!!! 
- ls_unroll_loop=ls_unroll_loop - ) - - self.baseline_mode = baseline_mode - self.baseline = baseline - self.baseline_optimizer = baseline_optimizer - self.gae_lambda = gae_lambda - self.likelihood_ratio_clipping = likelihood_ratio_clipping - - super(TRPOAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization - ) - - def initialize_model(self): - return PGProbRatioModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - discount=self.discount, - variable_noise=self.variable_noise, - states_preprocessing=self.states_preprocessing, - actions_exploration=self.actions_exploration, - reward_preprocessing=self.reward_preprocessing, - update_mode=self.update_mode, - memory=self.memory, - optimizer=self.optimizer, - network=self.network, - distributions=self.distributions, - entropy_regularization=self.entropy_regularization, - baseline_mode=self.baseline_mode, - baseline=self.baseline, - baseline_optimizer=self.baseline_optimizer, - gae_lambda=self.gae_lambda, - likelihood_ratio_clipping=self.likelihood_ratio_clipping - ) diff --git a/tensorforce/agents/vpg.py b/tensorforce/agents/vpg.py new file mode 100644 index 000000000..5c20ba911 --- /dev/null +++ b/tensorforce/agents/vpg.py @@ -0,0 +1,246 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError +from tensorforce.agents import TensorforceAgent + + +class VanillaPolicyGradient(TensorforceAgent): + """ + [Vanilla Policy Gradient](https://link.springer.com/article/10.1007/BF00992696) aka REINFORCE + agent (specification key: `vpg` or `reinforce`). + + Args: + states (specification): States specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of state + descriptions (usually taken from `Environment.states()`) with the following attributes: +
                • type ("bool" | "int" | "float") – state data type (default: "float").
                • shape (int | iter[int]) – state shape (required).
                • num_values (int > 0) – number of discrete state values (required for type "int").
                • min_value/max_value (float) – minimum/maximum state value (optional for type "float").
                + actions (specification): Actions specification + (required, better implicitly specified via + `environment` argument for `Agent.create(...)`), arbitrarily nested dictionary of + action descriptions (usually taken from `Environment.actions()`) with the following + attributes: +
                • type ("bool" | "int" | "float") – action data type (required).
                • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
                • num_values (int > 0) – number of discrete action values (required for type "int").
                • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
                + max_episode_timesteps (int > 0): Upper bound for numer of timesteps per episode + (default: not given, better implicitly + specified via `environment` argument for `Agent.create(...)`). + + batch_size (parameter, int > 0): Number of episodes + per update batch + (required). + + network ("auto" | specification): Policy network configuration, see the + [networks documentation](../modules/networks.html) + (default: "auto", automatically configured + network). + use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous + actions by default. + (default: false). + + memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + 1 episodes + (default: minimum capacity, usually does not + need to be changed). + + update_frequency ("never" | parameter, int > 0 | 0.0 < float <= 1.0): + Frequency of updates, relative to batch_size if float + (default: batch_size). + learning_rate (parameter, float > 0.0): Optimizer + learning rate + (default: 1e-3). + + discount (parameter, 0.0 <= float <= 1.0): Discount + factor for future rewards of discounted-sum reward estimation + (default: 0.99). + return_processing (specification): Return processing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no return processing). + advantage_processing (specification): Advantage processing as layer or list of layers, see + the [preprocessing documentation](../modules/preprocessing.html) + (default: no advantage processing). + predict_terminal_values (bool): Whether to predict the value of terminal states, usually + not required since max_episode_timesteps terminals are handled separately + (default: false). + reward_processing (specification): Reward preprocessing as layer or list of layers, see the + [preprocessing documentation](../modules/preprocessing.html) + (default: no reward processing). + + baseline (specification): Baseline network configuration, see the + [networks documentation](../modules/networks.html), main policy will be used as baseline + if none + (default: none). + baseline_optimizer (float > 0.0 | specification): Baseline optimizer configuration, see the + [optimizers documentation](../modules/optimizers.html), main optimizer will be used for + baseline if none, a float implies none and specifies a custom weight for the baseline + loss + (default: none). + + l2_regularization (parameter, float >= 0.0): + L2 regularization loss weight + (default: no L2 regularization). + entropy_regularization (parameter, float >= 0.0): + Entropy regularization loss weight, to discourage the policy distribution from being + "too certain" + (default: no entropy regularization). + + state_preprocessing (dict[specification]): State preprocessing as layer or list of layers, + see the [preprocessing documentation](../modules/preprocessing.html), + specified per state-type or -name + (default: linear normalization of bounded + float states to [-2.0, 2.0]). + exploration (parameter | dict[parameter], float >= 0.0): + Exploration, defined as the probability for uniformly random output in case of `bool` + and `int` actions, and the standard deviation of Gaussian noise added to every output in + case of `float` actions, specified globally or per action-type or -name + (default: no exploration). + variable_noise (parameter, float >= 0.0): + Add Gaussian noise with given standard deviation to all trainable variables, as + alternative exploration mechanism + (default: no variable noise).
                + + >>>: For arguments below, see the [Tensorforce agent documentation](tensorforce.html). + parallel_interactions (int > 0) + config (specification) + saver (path | specification) + summarizer (path | specification) + tracking ("all" | iter[string]) + recorder (path | specification) + """ + + def __init__( + # Environment + self, states, actions, max_episode_timesteps, batch_size, + # Network + network='auto', use_beta_distribution=False, + # Memory + memory='minimum', + # Optimization + update_frequency=1.0, learning_rate=1e-3, + # Reward estimation + discount=0.99, reward_processing=None, return_processing=None, advantage_processing=None, + predict_terminal_values=False, + # Baseline + baseline=None, baseline_optimizer=None, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Parallel interactions + parallel_interactions=1, + # Config, saver, summarizer, tracking, recorder + config=None, saver=None, summarizer=None, tracking=None, recorder=None, + # Deprecated + **kwargs + ): + if 'estimate_terminal' in kwargs: + raise TensorforceError.deprecated( + name='VPG', argument='estimate_terminal', replacement='predict_terminal_values' + ) + if 'baseline_network' in kwargs: + raise TensorforceError.deprecated( + name='VPG', argument='baseline_network', replacement='baseline' + ) + + self.spec = OrderedDict( + agent='vpg', + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + batch_size=batch_size, + network=network, use_beta_distribution=use_beta_distribution, + memory=memory, + update_frequency=update_frequency, learning_rate=learning_rate, + discount=discount, return_processing=return_processing, + advantage_processing=advantage_processing, + predict_terminal_values=predict_terminal_values, + baseline=baseline, baseline_optimizer=baseline_optimizer, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + parallel_interactions=parallel_interactions, + config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder + ) + + policy = dict( + type='parametrized_distributions', network=network, temperature=1.0, + use_beta_distribution=use_beta_distribution + ) + + if memory == 'minimum': + memory = dict(type='recent') + else: + memory = dict(type='recent', capacity=memory) + + update = dict(unit='episodes', batch_size=batch_size, frequency=update_frequency) + + optimizer = dict(type='adam', learning_rate=learning_rate) + objective = 'policy_gradient' + + if baseline is None: + assert not predict_terminal_values + reward_estimation = dict( + horizon='episode', discount=discount, predict_horizon_values=False, + estimate_advantage=False, reward_processing=reward_processing, + return_processing=return_processing + ) + assert baseline_optimizer is None + baseline_objective = None + + else: + reward_estimation = dict( + horizon='episode', discount=discount, predict_horizon_values='early', + estimate_advantage=True, predict_action_values=False, + reward_processing=reward_processing, return_processing=return_processing, + advantage_processing=advantage_processing, + predict_terminal_values=predict_terminal_values + ) + baseline = dict(type='parametrized_state_value', network=baseline) + assert baseline_optimizer is not None + baseline_objective = 
dict(type='state_value') + + super().__init__( + # Agent + states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, + parallel_interactions=parallel_interactions, config=config, recorder=recorder, + # TensorforceModel + policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, + baseline=baseline, baseline_optimizer=baseline_optimizer, + baseline_objective=baseline_objective, + l2_regularization=l2_regularization, entropy_regularization=entropy_regularization, + state_preprocessing=state_preprocessing, + exploration=exploration, variable_noise=variable_noise, + saver=saver, summarizer=summarizer, tracking=tracking, **kwargs + ) diff --git a/tensorforce/agents/vpg_agent.py b/tensorforce/agents/vpg_agent.py deleted file mode 100755 index a137f3d18..000000000 --- a/tensorforce/agents/vpg_agent.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.agents import LearningAgent -from tensorforce.models import PGLogProbModel - - -class VPGAgent(LearningAgent): - """ - Vanilla policy gradient agent - ([Williams, 1992)](https://link.springer.com/article/10.1007/BF00992696)). - """ - - def __init__( - self, - states, - actions, - network, - batched_observe=True, - batching_capacity=1000, - scope='vpg', - device=None, - saver=None, - summarizer=None, - distributed=None, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - update_mode=None, - memory=None, - optimizer=None, - discount=0.99, - distributions=None, - entropy_regularization=None, - baseline_mode=None, - baseline=None, - baseline_optimizer=None, - gae_lambda=None - ): - """ - Initializes the VPG agent. - - Args: - update_mode (spec): Update mode specification, with the following attributes: - - unit: 'episodes' if given (default: 'episodes'). - - batch_size: integer (default: 10). - - frequency: integer (default: batch_size). - memory (spec): Memory specification, see core.memories module for more information - (default: {type='latest', include_next_states=false, capacity=1000*batch_size}). - optimizer (spec): Optimizer specification, see core.optimizers module for more - information (default: {type='adam', learning_rate=1e-3}). - baseline_mode (str): One of 'states', 'network' (default: none). - baseline (spec): Baseline specification, see core.baselines module for more information - (default: none). - baseline_optimizer (spec): Baseline optimizer specification, see core.optimizers module - for more information (default: none). - gae_lambda (float): Lambda factor for generalized advantage estimation (default: none). 
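Analogously, a hedged sketch for the new `vpg`/`reinforce` specification key above, here with a separate state-value baseline; note that the constructor expects a `baseline_optimizer` whenever `baseline` is given. Network size and learning rates below are illustrative assumptions.

```python
from tensorforce import Agent, Environment

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)
agent = Agent.create(
    agent='vpg', environment=environment, batch_size=10, learning_rate=1e-3,
    # The baseline argument is a network spec; it is wrapped into a state-value function.
    baseline=dict(type='auto', size=32, depth=1),
    baseline_optimizer=dict(optimizer='adam', learning_rate=1e-3)
)
```

Without the `baseline` argument, the agent falls back to plain REINFORCE with discounted episode returns, as in the `baseline is None` branch above.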
- """ - - # Update mode - if update_mode is None: - update_mode = dict( - unit='episodes', - batch_size=10 - ) - elif 'unit' in update_mode: - # Tests check all modes for VPG. - # assert update_mode['unit'] == 'episodes' - pass - else: - update_mode['unit'] = 'episodes' - - # Memory - if memory is None: - # Assumed episode length of 1000 timesteps. - memory = dict( - type='latest', - include_next_states=False, - capacity=(1000 * update_mode['batch_size']) - ) - else: - assert not memory['include_next_states'] - - # Optimizer - if optimizer is None: - optimizer = dict( - type='adam', - learning_rate=1e-3 - ) - - self.baseline_mode = baseline_mode - self.baseline = baseline - self.baseline_optimizer = baseline_optimizer - self.gae_lambda = gae_lambda - - super(VPGAgent, self).__init__( - states=states, - actions=actions, - batched_observe=batched_observe, - batching_capacity=batching_capacity, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization - ) - - def initialize_model(self): - return PGLogProbModel( - states=self.states, - actions=self.actions, - scope=self.scope, - device=self.device, - saver=self.saver, - summarizer=self.summarizer, - distributed=self.distributed, - batching_capacity=self.batching_capacity, - variable_noise=self.variable_noise, - states_preprocessing=self.states_preprocessing, - actions_exploration=self.actions_exploration, - reward_preprocessing=self.reward_preprocessing, - update_mode=self.update_mode, - memory=self.memory, - optimizer=self.optimizer, - discount=self.discount, - network=self.network, - distributions=self.distributions, - entropy_regularization=self.entropy_regularization, - baseline_mode=self.baseline_mode, - baseline=self.baseline, - baseline_optimizer=self.baseline_optimizer, - gae_lambda=self.gae_lambda - ) diff --git a/tensorforce/contrib/__init__.py b/tensorforce/contrib/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tensorforce/contrib/ale.py b/tensorforce/contrib/ale.py deleted file mode 100644 index 2d714c7ed..000000000 --- a/tensorforce/contrib/ale.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -Arcade Learning Environment (ALE). 
https://github.com/mgbellemare/Arcade-Learning-Environment -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import numpy as np -from ale_python_interface import ALEInterface - -from tensorforce import TensorForceError -from tensorforce.environments import Environment - - -class ALE(Environment): - - def __init__(self, rom, frame_skip=1, repeat_action_probability=0.0, - loss_of_life_termination=False, loss_of_life_reward=0, display_screen=False, - seed=np.random.RandomState()): - """ - Initialize ALE. - - Args: - rom: Rom filename and directory. - frame_skip: Repeat action for n frames. Default 1. - repeat_action_probability: Repeats last action with given probability. Default 0. - loss_of_life_termination: Signals a terminal state on loss of life. Default False. - loss_of_life_reward: Reward/Penalty on loss of life (negative values are a penalty). Default 0. - display_screen: Displays the emulator screen. Default False. - seed: Random seed - """ - - self.ale = ALEInterface() - self.rom = rom - - self.ale.setBool(b'display_screen', display_screen) - self.ale.setInt(b'random_seed', seed.randint(0, 9999)) - self.ale.setFloat(b'repeat_action_probability', repeat_action_probability) - self.ale.setBool(b'color_averaging', False) - self.ale.setInt(b'frame_skip', frame_skip) - - # All set commands must be done before loading the ROM - self.ale.loadROM(rom.encode()) - - # Setup gamescreen object - width, height = self.ale.getScreenDims() - self.gamescreen = np.empty((height, width, 3), dtype=np.uint8) - - self.frame_skip = frame_skip - - # Setup action converter - # ALE returns legal action indexes, convert these to just numbers - self.action_inds = self.ale.getMinimalActionSet() - - # Setup lives - self.loss_of_life_reward = loss_of_life_reward - self.cur_lives = self.ale.lives() - self.loss_of_life_termination = loss_of_life_termination - self.life_lost = False - - def __str__(self): - return 'ALE({})'.format(self.rom) - - def close(self): - self.ale = None - - def reset(self): - self.ale.reset_game() - self.cur_lives = self.ale.lives() - self.life_lost = False - # Clear gamescreen - self.gamescreen = np.empty(self.gamescreen.shape, dtype=np.uint8) - return self.current_state - - def execute(self, actions): - # Convert action to ale action - ale_actions = self.action_inds[actions] - - # Get reward and process terminal & next state - rew = self.ale.act(ale_actions) - if self.loss_of_life_termination or self.loss_of_life_reward != 0: - new_lives = self.ale.lives() - if new_lives < self.cur_lives: - self.cur_lives = new_lives - self.life_lost = True - rew += self.loss_of_life_reward - - terminal = self.is_terminal - state_tp1 = self.current_state - return state_tp1, terminal, rew - - @property - def states(self): - return dict(shape=self.gamescreen.shape, type=float) - - @property - def actions(self): - return dict(type='int', num_actions=len(self.action_inds), names=self.action_names) - - @property - def current_state(self): - self.gamescreen = self.ale.getScreenRGB(self.gamescreen) - return np.copy(self.gamescreen) - - @property - def is_terminal(self): - if self.loss_of_life_termination and self.life_lost: - return True - else: - return self.ale.game_over() - - @property - def action_names(self): - action_names = [ - 'No-Op', - 'Fire', - 'Up', - 'Right', - 'Left', - 'Down', - 'Up Right', - 'Up Left', - 'Down Right', - 'Down Left', - 'Up Fire', - 'Right Fire', - 'Left Fire', - 'Down Fire', - 'Up Right Fire', - 'Up Left Fire', 
- 'Down Right Fire', - 'Down Left Fire' - ] - return np.asarray(action_names)[self.action_inds] diff --git a/tensorforce/contrib/deepmind_lab.py b/tensorforce/contrib/deepmind_lab.py deleted file mode 100755 index 70363e905..000000000 --- a/tensorforce/contrib/deepmind_lab.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import numpy as np -import deepmind_lab -from tensorforce.environments.environment import Environment - - -class DeepMindLab(Environment): - """ - DeepMind Lab Integration: - https://arxiv.org/abs/1612.03801 - https://github.com/deepmind/lab - - Since DeepMind lab is only available as source code, a manual install - via bazel is required. Further, due to the way bazel handles external - dependencies, cloning TensorForce into lab is the most convenient way to - run it using the bazel BUILD file we provide. To use lab, first download - and install it according to instructions - : - - ```bash - git clone https://github.com/deepmind/lab.git - ``` - - Add to the lab main BUILD file: - - ``` - package(default_visibility = ["//visibility:public"]) - ``` - - Clone TensorForce into the lab directory, then run the TensorForce bazel runner. - - Note that using any specific configuration file currently requires changing the Tensorforce - BUILD file to adjust environment parameters. - - ```bash - bazel run //tensorforce:lab_runner - ``` - - Please note that we have not tried to reproduce any lab results yet, and - these instructions just explain connectivity in case someone wants to - get started there. - - - """ - - def __init__( - self, - level_id, - repeat_action=1, - state_attribute='RGB_INTERLACED', - settings={'width': '320', 'height': '240', 'fps': '60', 'appendCommand': ''} - ): - """ - Initialize DeepMind Lab environment. - - Args: - level_id: string with id/descriptor of the level, e.g. 'seekavoid_arena_01'. - repeat_action: number of frames the environment is advanced, executing the given action during every frame. - state_attribute: Attributes which represents the state for this environment, should adhere to the - specification given in DeepMindLabEnvironment.state_spec(level_id). - settings: dict specifying additional settings as key-value string pairs. The following options - are recognized: 'width' (horizontal resolution of the observation frames), 'height' - (vertical resolution of the observation frames), 'fps' (frames per second) and 'appendCommand' - (commands for the internal Quake console). 
- - """ - self.level_id = level_id - self.level = deepmind_lab.Lab(level=level_id, observations=[state_attribute], config=settings) - self.repeat_action = repeat_action - self.state_attribute = state_attribute - - def __str__(self): - return 'DeepMindLab({})'.format(self.level_id) - - def close(self): - """ - Closes the environment and releases the underlying Quake III Arena instance. - No other method calls possible afterwards. - """ - self.level.close() - self.level = None - - def reset(self): - """ - Resets the environment to its initialization state. This method needs to be called to start a - new episode after the last episode ended. - - :return: initial state - """ - self.level.reset() # optional: episode=-1, seed=None - return self.level.observations()[self.state_attribute] - - def execute(self, actions): - """ - Pass action to universe environment, return reward, next step, terminal state and - additional info. - - :param action: action to execute as numpy array, should have dtype np.intc and should adhere to - the specification given in DeepMindLabEnvironment.action_spec(level_id) - :return: dict containing the next state, the reward, and a boolean indicating if the - next state is a terminal state - """ - adjusted_actions = list() - for action_spec in self.level.action_spec(): - if action_spec['min'] == -1 and action_spec['max'] == 1: - adjusted_actions.append(actions[action_spec['name']] - 1) - else: - adjusted_actions.append(actions[action_spec['name']]) # clip? - actions = np.array(adjusted_actions, dtype=np.intc) - - reward = self.level.step(action=actions, num_steps=self.repeat_action) - state = self.level.observations()['RGB_INTERLACED'] - terminal = not self.level.is_running() - return state, terminal, reward - - @property - def states(self): - states = dict() - - for state in self.level.observation_spec(): - state_type = state['dtype'] - - if state_type == np.uint8: - state_type = np.float32 - - if state['name'] == self.state_attribute: - return dict(shape=state['shape'], type=state_type) - - return states - - @property - def actions(self): - actions = dict() - for action in self.level.action_spec(): - if action['min'] == -1 and action['max'] == 1: - actions[action['name']] = dict(type='int', num_actions=3) - else: - actions[action['name']] = dict(type='float', min_value=action['min'], max_value=action['max']) - return actions - - @property - def num_steps(self): - """ - Number of frames since the last reset() call. - """ - return self.level.num_steps() - - @property - def fps(self): - """ - An advisory metric that correlates discrete environment steps ("frames") with real - (wallclock) time: the number of frames per (real) second. - """ - return self.level.fps() diff --git a/tensorforce/contrib/maze_explorer.py b/tensorforce/contrib/maze_explorer.py deleted file mode 100644 index c2868b049..000000000 --- a/tensorforce/contrib/maze_explorer.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import mazeexp as mx - -from tensorforce.environments import Environment - -class MazeExplorer(Environment): - """ - MazeExplorer Integration: https://github.com/mryellow/maze_explorer. - """ - - def __init__(self, mode_id=0, visible=True): - """ - Initialize MazeExplorer. - - Args: - mode_id: Game mode ID. See https://github.com/mryellow/maze_explorer - visible: Show output window - """ - - self.mode_id = int(mode_id) - # Might raise gym.error.UnregisteredEnv or gym.error.DeprecatedEnv - self.engine = mx.MazeExplorer(mode_id, visible) - - def __str__(self): - return 'MazeExplorer({})'.format(self.mode_id) - - def close(self): - self.engine = None - - def reset(self): - # TODO: Reset to `ones`? - return self.engine.reset() - - def execute(self, actions): - state, reward, terminal, _ = self.engine.act(actions) - return state, terminal, reward - - @property - def states(self): - # Use `observation_chans` to multichannel with `item` sensors. - if self.engine.observation_chans > 1: - shape = (self.engine.observation_num, self.engine.observation_chans) - else: - shape = (self.engine.observation_num,) - - return dict(shape=shape, type='float') - - @property - def actions(self): - return dict(type='int', num_actions=self.engine.actions_num) diff --git a/tensorforce/contrib/openai_gym.py b/tensorforce/contrib/openai_gym.py deleted file mode 100755 index 985c20b8f..000000000 --- a/tensorforce/contrib/openai_gym.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -OpenAI Gym Integration: https://gym.openai.com/. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import gym -import numpy as np -from tensorforce import TensorForceError -from tensorforce.environments import Environment - - -class OpenAIGym(Environment): - - def __init__(self, gym_id, monitor=None, monitor_safe=False, monitor_video=0, visualize=False): - """ - Initialize OpenAI Gym. - - Args: - gym_id: OpenAI Gym environment ID. See https://gym.openai.com/envs - monitor: Output directory. Setting this to None disables monitoring. - monitor_safe: Setting this to True prevents existing log files to be overwritten. Default False. - monitor_video: Save a video every monitor_video steps. Setting this to 0 disables recording of videos. - visualize: If set True, the program will visualize the trainings of gym's environment. Note that such - visualization is probabily going to slow down the training. 
- """ - - self.gym_id = gym_id - self.gym = gym.make(gym_id) # Might raise gym.error.UnregisteredEnv or gym.error.DeprecatedEnv - self.visualize = visualize - - if monitor: - if monitor_video == 0: - video_callable = False - else: - video_callable = (lambda x: x % monitor_video == 0) - self.gym = gym.wrappers.Monitor(self.gym, monitor, force=not monitor_safe, video_callable=video_callable) - - def __str__(self): - return 'OpenAIGym({})'.format(self.gym_id) - - def close(self): - self.gym.close() - self.gym = None - - def reset(self): - if isinstance(self.gym, gym.wrappers.Monitor): - self.gym.stats_recorder.done = True - return self.gym.reset() - - def execute(self, actions): - if self.visualize: - self.gym.render() - # if the actions is not unique, that is, if the actions is a dict - if isinstance(actions, dict): - actions = [actions['action{}'.format(n)] for n in range(len(actions))] - state, reward, terminal, _ = self.gym.step(actions) - return state, terminal, reward - - @property - def states(self): - return OpenAIGym.state_from_space(space=self.gym.observation_space) - - @staticmethod - def state_from_space(space): - if isinstance(space, gym.spaces.Discrete): - return dict(shape=(), type='int') - elif isinstance(space, gym.spaces.MultiBinary): - return dict(shape=space.n, type='int') - elif isinstance(space, gym.spaces.MultiDiscrete): - return dict(shape=space.num_discrete_space, type='int') - elif isinstance(space, gym.spaces.Box): - return dict(shape=tuple(space.shape), type='float') - elif isinstance(space, gym.spaces.Tuple): - states = dict() - n = 0 - for space in space.spaces: - state = OpenAIGym.state_from_space(space=space) - if 'type' in state: - states['state{}'.format(n)] = state - n += 1 - else: - for state in state.values(): - states['state{}'.format(n)] = state - n += 1 - return states - else: - raise TensorForceError('Unknown Gym space.') - - @property - def actions(self): - return OpenAIGym.action_from_space(space=self.gym.action_space) - - @staticmethod - def action_from_space(space): - if isinstance(space, gym.spaces.Discrete): - return dict(type='int', num_actions=space.n) - elif isinstance(space, gym.spaces.MultiBinary): - return dict(type='bool', shape=space.n) - elif isinstance(space, gym.spaces.MultiDiscrete): - if (space.low == space.low[0]).all() and (space.high == space.high[0]).all(): - return dict(type='int', num_actions=(space.high[0] - space.low[0]), shape=space.num_discrete_space) - else: - actions = dict() - for n in range(space.num_discrete_space): - actions['action{}'.format(n)] = dict(type='int', num_actions=(space.high[n] - space.low[n])) - return actions - elif isinstance(space, gym.spaces.Box): - if (space.low == space.low[0]).all() and (space.high == space.high[0]).all(): - return dict(type='float', shape=space.low.shape, - min_value=np.float32(space.low[0]), - max_value=np.float32(space.high[0])) - else: - actions = dict() - low = space.low.flatten() - high = space.high.flatten() - for n in range(low.shape[0]): - actions['action{}'.format(n)] = dict(type='float', min_value=low[n], max_value=high[n]) - return actions - elif isinstance(space, gym.spaces.Tuple): - actions = dict() - n = 0 - for space in space.spaces: - action = OpenAIGym.action_from_space(space=space) - if 'type' in action: - actions['action{}'.format(n)] = action - n += 1 - else: - for action in action.values(): - actions['action{}'.format(n)] = action - n += 1 - return actions - else: - raise TensorForceError('Unknown Gym space.') diff --git 
a/tensorforce/contrib/openai_universe.py b/tensorforce/contrib/openai_universe.py deleted file mode 100755 index 1e7cd2683..000000000 --- a/tensorforce/contrib/openai_universe.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import gym -import universe -from gym.spaces.discrete import Discrete -from universe.spaces import VNCActionSpace, VNCObservationSpace - -from tensorforce import TensorForceError -from tensorforce.environments.environment import Environment - - -class OpenAIUniverse(Environment): - """ - OpenAI Universe Integration: https://universe.openai.com/. - Contains OpenAI Gym: https://gym.openai.com/. - """ - - def __init__(self, env_id): - """ - Initialize OpenAI universe environment. - - Args: - env_id: string with id/descriptor of the universe environment, e.g. 'HarvestDay-v0'. - """ - self.env_id = env_id - self.env = gym.make(env_id) - - def __str__(self): - return 'OpenAI-Universe({})'.format(self.env_id) - - def close(self): - self.env = None - - def reset(self): - state = self.env.reset() - if state == [None]: - state, r, t = self._wait_state(state, None, None) - - if isinstance(state[0], dict): - # We can't handle string states right now, so omit the text state for now - state[0].pop('text', None) - - return state[0] - - def execute(self, actions): - state, terminal, reward = self._execute(actions) - return self._wait_state(state, terminal, reward) - - def _execute(self, actions): - pass_actions = [] - for action_name, value in actions.items(): - if action_name == 'key': - key_event = self._int_to_key(value) - pass_actions.append(key_event) - elif action_name == 'button': - btn_event = self._int_to_btn(value) - x, y = self._int_to_pos(actions.get('position', 0)) - pass_actions.append(universe.spaces.PointerEvent(x, y, btn_event)) - - state, reward, terminal, _ = self.env.step([pass_actions]) - - if isinstance(state[0], dict): - # We can't handle string states right now, so omit the text state for now - state[0].pop('text', None) - - return state[0], terminal[0], reward[0] - - def _int_to_pos(self, flat_position): - """Returns x, y from flat_position integer. 
- - Args: - flat_position: flattened position integer - - Returns: x, y - - """ - return flat_position % self.env.action_space.screen_shape[0],\ - flat_position % self.env.action_space.screen_shape[1] - - def _key_to_int(self, key_event): - return self.env.action_space.keys.index(key_event) - - def _int_to_key(self, key_value): - return self.env.action_space.keys[key_value] - - def _btn_to_int(self, btn_event): - return self.env.action_space.buttonmasks.index(btn_event) - - def _int_to_btn(self, btn_value): - return self.env.action_space.buttonmasks[btn_value] - - def _wait_state(self, state, reward, terminal): - """ - Wait until there is a state. - """ - while state == [None] or not state: - state, terminal, reward = self._execute(dict(key=0)) - - return state, terminal, reward - - def configure(self, *args, **kwargs): - self.env.configure(*args, **kwargs) - - def render(self, *args, **kwargs): - self.env.render(*args, **kwargs) - - @property - def states(self): - print(self.env.observation_space) - if isinstance(self.env.observation_space, VNCObservationSpace): - return dict( - # VNCObeservationSpace seems to be hardcoded to 1024x768 - vision=dict(type='float', shape=(768, 1024, 3)) - # vision = dict(type=float, shape=(self.env.action_space.screen_shape[1], - # self.env.action_space.screen_shape[0], 3)) - # text=dict(type=str, shape=(1,)) # TODO: implement string states - ) - elif isinstance(self.env.observation_space, Discrete): - return dict(shape=(), type='float') - else: - return dict(shape=tuple(self.env.observation_space.shape), type='float') - - @property - def actions(self): - if isinstance(self.env.action_space, VNCActionSpace): - return dict( - key=dict(type='int', num_actions=len(self.env.action_space.keys)), - button=dict(type='int', num_actions=len(self.env.action_space.buttonmasks)), - position=dict( - type='int', - num_actions=self.env.action_space.screen_shape[0] * self.env.action_space.screen_shape[1] - ) - ) - elif isinstance(self.env.action_space, Discrete): - return dict(type='int', num_actions=self.env.action_space.n) - elif len(self.env.action_space.shape) == 1: - return {'action' + str(n): dict(type='float') for n in range(len(self.env.action_space.shape[0]))} - else: - raise TensorForceError() diff --git a/tensorforce/contrib/remote_environment.py b/tensorforce/contrib/remote_environment.py deleted file mode 100644 index 7df56403a..000000000 --- a/tensorforce/contrib/remote_environment.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - - -from tensorforce.environments import Environment -import socket -import msgpack -import msgpack_numpy as mnp -import errno -import os -from tensorforce import TensorForceError -import logging -import time - - -class RemoteEnvironment(Environment): - def __init__(self, host="localhost", port=6025): - """ - A remote Environment that one can connect to through tcp. 
- Implements a simple msgpack protocol to get the step/reset/etc.. commands to the - remote server and simply waits (blocks) for a response. - - Args: - host (str): The hostname to connect to. - port (int): The port to connect to. - """ - Environment.__init__(self) - self.port = int(port) or 6025 - self.host = host or "localhost" - self.socket = None - # The size of the response buffer (depends on the Env's observation-space). - self.buffer_size = 8192 - - # Cache the last received observation (through socket) here. - self.last_observation = None - - def __str__(self): - return "RemoteEnvironment({}:{}{})".format(self.host, self.port, " [connected]" if self.socket else "") - - def close(self): - """ - Same as disconnect method. - """ - self.disconnect() - - def connect(self, timeout=600): - """ - Starts the server tcp connection on the given host:port. - - Args: - timeout (int): The time (in seconds) for which we will attempt a connection to the remote - (every 5sec). After that (or if timeout is None or 0), an error is raised. - """ - # If we are already connected, return error. - if self.socket: - raise TensorForceError("Already connected to {}:{}. Only one connection allowed at a time. " + - "Close first by calling `close`!".format(self.host, self.port)) - self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - - if timeout < 5 or timeout is None: - timeout = 5 - - err = 0 - start_time = time.time() - while time.time() - start_time < timeout: - self.socket.settimeout(5) - err = self.socket.connect_ex((self.host, self.port)) - if err == 0: - break - time.sleep(1) - if err != 0: - raise TensorForceError("Error when trying to connect to {}:{}: errno={} errcode='{}' '{}'". - format(self.host, self.port, err, errno.errorcode[err], os.strerror(err))) - - def disconnect(self): - """ - Ends our server tcp connection. - """ - # If we are not connected, return error. - if not self.socket: - logging.warning("No active socket to close!") - return - # Close our socket. - self.socket.close() - self.socket = None - - @property - def current_state(self): - return self.last_observation - - -class MsgPackNumpyProtocol(object): - """ - A simple protocol to communicate over tcp sockets, which can be used by RemoteEnvironment implementations. - The protocol is based on msgpack-numpy encoding and decoding. - - Each message has a simple 8-byte header, which encodes the length of the subsequent msgpack-numpy - encoded byte-string. - All messages received need to have the 'status' field set to 'ok'. If 'status' is set to 'error', - the field 'message' should be populated with some error information. - - Examples: - client sends: "[8-byte header]msgpack-encoded({"cmd": "seed", "value": 200})" - server responds: "[8-byte header]msgpack-encoded({"status": "ok", "value": 200})" - - client sends: "[8-byte header]msgpack-encoded({"cmd": "reset"})" - server responds: "[8-byte header]msgpack-encoded({"status": "ok"})" - - client sends: "[8-byte header]msgpack-encoded({"cmd": "step", "action": 5})" - server responds: "[8-byte header]msgpack-encoded({"status": "ok", "obs_dict": {... some observations}, - "reward": -10.0, "is_terminal": False})" - """ - def __init__(self, max_msg_len=8192): - """ - Args: - max_msg_len (int): The maximum number of bytes to read from the socket. - """ - self.max_msg_len = max_msg_len - # Make all msgpack methods use the numpy-aware de/encoders. - mnp.patch() - - def send(self, message, socket_): - """ - Sends a message (dict) to the socket. 
Message consists of a 8-byte len header followed by a msgpack-numpy - encoded dict. - - Args: - message: The message dict (e.g. {"cmd": "reset"}) - socket_: The python socket object to use. - """ - if not socket_: - raise TensorForceError("No socket given in call to `send`!") - elif not isinstance(message, dict): - raise TensorForceError("Message to be sent must be a dict!") - message = msgpack.packb(message) - len_ = len(message) - # prepend 8-byte len field to all our messages - socket_.send(bytes("{:08d}".format(len_), encoding="ascii") + message) - - def recv(self, socket_): - """ - Receives a message as msgpack-numpy encoded byte-string from the given socket object. - Blocks until something was received. - - Args: - socket_: The python socket object to use. - Returns: The decoded (as dict) message received. - """ - unpacker = msgpack.Unpacker(encoding="utf-8") - - # Wait for an immediate response. - response = socket_.recv(8) # get the length of the message - if response == b"": - raise TensorForceError("No data received by socket.recv in call to method `recv` " + - "(listener possibly closed)!") - orig_len = int(response) - received_len = 0 - while True: - data = socket_.recv(min(orig_len - received_len, self.max_msg_len)) - # There must be a response. - if not data: - raise TensorForceError("No data of len {} received by socket.recv in call to method `recv`!". - format(orig_len - received_len)) - data_len = len(data) - received_len += data_len - unpacker.feed(data) - - if received_len == orig_len: - break - - # Get the data. - for message in unpacker: - if "status" in message: - if message["status"] == "ok": - return message - else: - raise TensorForceError("RemoteEnvironment server error: {}". - format(message.get("message", "not specified"))) - else: - raise TensorForceError("Message without field 'status' received!") - raise TensorForceError("No message encoded in data stream (data stream had len={})". - format(orig_len)) - diff --git a/tensorforce/contrib/state_settable_environment.py b/tensorforce/contrib/state_settable_environment.py deleted file mode 100644 index dc679c808..000000000 --- a/tensorforce/contrib/state_settable_environment.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from tensorforce.environments import Environment - - -class StateSettableEnvironment(Environment): - """ - An Environment that implements the set_state method to set the current state - to some new state using setter instructions. - """ - def set_state(self, **kwargs): - """ - Sets the current state of the environment manually to some other state and returns a new observation. - - Args: - **kwargs: The set instruction(s) to be executed by the environment. - A single set instruction usually set a single property of the - state/observation vector to some new value. - Returns: The observation dictionary of the Environment after(!) 
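The protocol described above frames every msgpack-numpy message with an 8-byte ASCII length header followed by the packed body. A minimal sketch of that framing, assuming `msgpack` is installed; `frame` and `unframe` are illustrative names, not part of the removed class:

```python
# Length-prefixed framing as used by the removed MsgPackNumpyProtocol.
import msgpack

def frame(message):
    body = msgpack.packb(message)
    return bytes('{:08d}'.format(len(body)), encoding='ascii') + body

def unframe(data):
    length = int(data[:8])  # 8-byte ASCII length header
    return msgpack.unpackb(data[8:8 + length], raw=False)

assert unframe(frame({'cmd': 'reset'})) == {'cmd': 'reset'}
```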
setting it to the new state. - """ - raise NotImplementedError - diff --git a/tensorforce/contrib/unreal_engine.py b/tensorforce/contrib/unreal_engine.py deleted file mode 100644 index 12b0f4863..000000000 --- a/tensorforce/contrib/unreal_engine.py +++ /dev/null @@ -1,392 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - - -from tensorforce.contrib.remote_environment import RemoteEnvironment, MsgPackNumpyProtocol -from tensorforce.contrib.state_settable_environment import StateSettableEnvironment -from tensorforce import TensorForceError -from cached_property import cached_property -import re -import time -import itertools -import logging - - -class UE4Environment(RemoteEnvironment, StateSettableEnvironment): - """ - A special RemoteEnvironment for UE4 game connections. - Communicates with the remote to receive information on the definitions of action- and observation spaces. - Sends UE4 Action- and Axis-mappings as RL-actions and receives observations back defined by MLObserver - objects placed in the Game - (these could be camera pixels or other observations, e.g. a x/y/z position of some game actor). - """ - def __init__( - self, - host="localhost", - port=6025, - connect=True, - discretize_actions=False, - delta_time=1/60, - num_ticks=4 - ): - """ - Args: - host (str): The hostname to connect to. - port (int): The port to connect to. - connect (bool): Whether to connect already in this c'tor. - discretize_actions (bool): Whether to treat axis-mappings defined in UE4 game as discrete actions. - This would be necessary e.g. for agents that use q-networks where the output are q-values per discrete - state-action pair. - delta_time (float): The fake delta time to use for each single game tick. - num_ticks (int): The number of ticks to be executed in a single act call (each tick will - repeat the same given actions). - """ - RemoteEnvironment.__init__(self, host, port) - - # RemoteEnvironment should send a name of the game upon connection. - self.game_name = None - self.action_space_desc = None - self.observation_space_desc = None - - self.discretize_actions = discretize_actions - self.discretized_actions = None - self.delta_time = delta_time - self.num_ticks = num_ticks - - # Our tcp messaging protocol to use (simple len-header + msgpack-numpy-body). - self.protocol = MsgPackNumpyProtocol() - - if connect: - self.connect() - - def __str__(self): - return "UE4Environment({}:{}{})".format(self.host, self.port, "[connected; {}]". - format(self.game_name) if self.socket else "") - - def connect(self, timeout=600): - RemoteEnvironment.connect(self, timeout) - - # Get action- and state-specs from our game. 
- self.protocol.send({"cmd": "get_spec"}, self.socket) - response = self.protocol.recv(self.socket) - - if "observation_space_desc" not in response or "action_space_desc" not in response: - raise TensorForceError("ERROR in UE4Environment.connect: no observation- or action-space-desc sent " - "by remote server!") - - # Game's name - self.game_name = response.get("game_name") # keep non-mandatory for now - # Observers - if "observation_space_desc" not in response: - raise TensorForceError("Response to `get_spec` does not contain field `observation_space_desc`!") - self.observation_space_desc = response["observation_space_desc"] - # Action-mappings - if "action_space_desc" not in response: - raise TensorForceError("Response to `get_spec` does not contain field `action_space_desc`!") - self.action_space_desc = response["action_space_desc"] - - if self.discretize_actions: - self.discretize_action_space_desc() - - # Invalidate our states- and actions caches. - if "states" in self.__dict__: - del self.__dict__["states"] - if "actions" in self.__dict__: - del self.__dict__["actions"] - - def seed(self, seed=None): - if not seed: - seed = time.time() - # Send command. - self.protocol.send({"cmd": "seed", "value": int(seed)}, self.socket) - # Wait for response. - response = self.protocol.recv(self.socket) - if "status" not in response: - raise TensorForceError("Message without field 'status' received!") - elif response["status"] != "ok": - raise TensorForceError("Message 'status' for seed command is not 'ok' ({})!".format(response["status"])) - return seed - - def reset(self): - """ - same as step (no kwargs to pass), but needs to block and return observation_dict - - stores the received observation in self.last_observation - """ - # Send command. - self.protocol.send({"cmd": "reset"}, self.socket) - # Wait for response. - response = self.protocol.recv(self.socket) - # Extract observations. - return self.extract_observation(response) - - def set_state(self, setters, **kwargs): - if "cmd" in kwargs: - raise TensorForceError("Key 'cmd' must not be present in **kwargs to method `set`!") - - # Forward kwargs to remote (only add command: set). - message = kwargs - message["cmd"] = "set" - - # Sanity check given setters. - # Solve single tuple with prop-name and value -> should become a list (len=1) of this tuple. - if len(setters) >= 2 and not isinstance(setters[1], (list, tuple)): - setters = list((setters,)) - for set_cmd in setters: - if not re.match(r'\w+(:\w+)*', set_cmd[0]): - raise TensorForceError("ERROR: property ({}) in setter-command does not match correct pattern!". - format(set_cmd[0])) - if len(set_cmd) == 3 and not isinstance(set_cmd[2], bool): - raise TensorForceError("ERROR: 3rd item in setter-command must be of type bool ('is_relative' flag)!") - message["setters"] = setters - - self.protocol.send(message, self.socket) - # Wait for response. - response = self.protocol.recv(self.socket) - return self.extract_observation(response) - - def execute(self, actions): - """ - Executes a single step in the UE4 game. This step may be comprised of one or more actual game ticks for all of - which the same given - action- and axis-inputs (or action number in case of discretized actions) are repeated. - UE4 distinguishes between action-mappings, which are boolean actions (e.g. jump or dont-jump) and axis-mappings, - which are continuous actions - like MoveForward with values between -1.0 (run backwards) and 1.0 (run forwards), 0.0 would mean: stop. 
- """ - action_mappings, axis_mappings = [], [] - - # TODO: what if more than one actions are passed? - - # Discretized -> each action is an int - if self.discretize_actions: - # Pull record from discretized_actions, which will look like: [A, Right, SpaceBar]. - combination = self.discretized_actions[actions] - # Translate to {"axis_mappings": [('A', 1.0), (Right, 1.0)], "action_mappings": [(SpaceBar, True)]} - for key, value in combination: - # Action mapping (True or False). - if isinstance(value, bool): - action_mappings.append((key, value)) - # Axis mapping: always use 1.0 as value as UE4 already multiplies with the correct scaling factor. - else: - axis_mappings.append((key, value)) - # Non-discretized: Each action is a dict of action- and axis-mappings defined in UE4 game's input settings. - # Re-translate Incoming action names into keyboard keys for the server. - elif actions: - try: - action_mappings, axis_mappings = self.translate_abstract_actions_to_keys(actions) - except KeyError as e: - raise TensorForceError("Action- or axis-mapping with name '{}' not defined in connected UE4 game!". - format(e)) - - # message = {"cmd": "step", 'delta_time': 0.33, - # 'actions': [('X', True), ('Y', False)], - # 'axes': [('Left': 1.0), ('Up': -1.0)] - # } - message = dict( - cmd="step", - delta_time=self.delta_time, - num_ticks=self.num_ticks, - actions=action_mappings, - axes=axis_mappings - ) - self.protocol.send(message, self.socket) - # Wait for response (blocks). - response = self.protocol.recv(self.socket) - r = response.pop("_reward", 0.0) - is_terminal = response.pop("_is_terminal", False) - - obs = self.extract_observation(response) - # Cache last observation - self.last_observation = obs - return obs, is_terminal, r - - @cached_property - def states(self): - observation_space = {} - # Derive observation space from observation_space_desc. - if self.observation_space_desc: - for key, desc in self.observation_space_desc.items(): - type_ = desc["type"] - if type_ == "Bool": - space = dict(type="float", shape=()) - elif type_ == "IntBox": - space = dict( - type="float", - shape=desc.get("shape", ()), - min_value=desc.get("min", None), - max_value=desc.get("max", None) - ) - elif type_ == "Continuous": - space = dict( - type="float", - shape=desc.get("shape", ()), - min_value=desc.get("min", None), - max_value=desc.get("max", None) - ) - # TODO: Enums - else: - raise TensorForceError("Unsupported space type {} coming from Environment (" - "observation_space_desc)!".format(type_)) - - observation_space[key] = space - # Simplest case: if only one observer -> use that one. - if len(observation_space) == 1: - observation_space = list(observation_space.values())[0] - return observation_space - - @cached_property - def actions(self): - # Derive action space from action_space_desc. - if not self.action_space_desc: - return {} - - # Discretize all mappings. Pretend that each single mapping and combination thereof is its own discrete action. - # E.g. MoveForward=Up(1.0)+Down(-1.0) MoveRight=Right(1.0)+Left(-1.0) -> UpRight, UpLeft, Right, Left, Up, Down, - # DownRight, DownLeft, Idle - if self.discretize_actions: - return dict(type="int", num_actions=len(self.discretized_actions)) - # Leave each mapping as independent action, which may be continuous and can be combined with all other actions - # in any way. 
- else: - action_space = {} - for action_name, properties in self.action_space_desc.items(): - # UE4 action mapping -> bool - if properties["type"] == "action": - action_space[action_name] = dict(type="int", num_actions=2) - # UE4 axis mapping -> continuous (float) unless we have discretized axes - else: - min_ = 0.0 - max_ = 0.0 - for mapping in properties["keys"]: - if mapping[1] > max_: - max_ = mapping[1] - if mapping[1] < min_: - min_ = mapping[1] - action_space[action_name] = dict(type="float", shape=(), min_value=min_, max_value=max_) - return action_space - - def translate_abstract_actions_to_keys(self, abstract): - """ - Translates a list of tuples ([pretty mapping], [value]) to a list of tuples ([some key], [translated value]) - each single item in abstract will undergo the following translation: - - Example1: - we want: "MoveRight": 5.0 - possible keys for the action are: ("Right", 1.0), ("Left", -1.0) - result: "Right": 5.0 * 1.0 = 5.0 - - Example2: - we want: "MoveRight": -0.5 - possible keys for the action are: ("Left", -1.0), ("Right", 1.0) - result: "Left": -0.5 * -1.0 = 0.5 (same as "Right": -0.5) - """ - - # Solve single tuple with name and value -> should become a list (len=1) of this tuple. - if len(abstract) >= 2 and not isinstance(abstract[1], (list, tuple)): - abstract = list((abstract,)) - - # Now go through the list and translate each axis into an actual keyboard key (or mouse event/etc..). - actions, axes = [], [] - for a in abstract: - # first_key = key-name (action mapping or discretized axis mapping) OR tuple (key-name, scale) (continuous - # axis mapping) - first_key = self.action_space_desc[a[0]]["keys"][0] - # action mapping - if isinstance(first_key, (bytes, str)): - actions.append((first_key, a[1])) - # axis mapping - elif isinstance(first_key, tuple): - axes.append((first_key[0], a[1] * first_key[1])) - else: - raise TensorForceError("action_space_desc contains unsupported type for key {}!".format(a[0])) - - return actions, axes - - def discretize_action_space_desc(self): - """ - Creates a list of discrete action(-combinations) in case we want to learn with a discrete set of actions, - but only have action-combinations (maybe even continuous) available from the env. - E.g. the UE4 game has the following action/axis-mappings: - - ```javascript - { - 'Fire': - {'type': 'action', 'keys': ('SpaceBar',)}, - 'MoveRight': - {'type': 'axis', 'keys': (('Right', 1.0), ('Left', -1.0), ('A', -1.0), ('D', 1.0))}, - } - ``` - - -> this method will discretize them into the following 6 discrete actions: - - ```javascript - [ - [(Right, 0.0),(SpaceBar, False)], - [(Right, 0.0),(SpaceBar, True)] - [(Right, -1.0),(SpaceBar, False)], - [(Right, -1.0),(SpaceBar, True)], - [(Right, 1.0),(SpaceBar, False)], - [(Right, 1.0),(SpaceBar, True)], - ] - ``` - - """ - # Put all unique_keys lists in one list and itertools.product that list. - unique_list = [] - for nice, record in self.action_space_desc.items(): - list_for_record = [] - if record["type"] == "axis": - # The main key for this record (always the first one) - head_key = record["keys"][0][0] - # The reference value (divide by this one to get the others) - head_value = record["keys"][0][1] - # The zero key (idle action; axis scale=0.0) - list_for_record.append((head_key, 0.0)) - set_ = set() - for key_and_scale in self.action_space_desc[nice]["keys"]: - # Build unique lists of mappings (each axis value should only be represented once). 
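As the docstring above illustrates, discretization expands every action- and axis-mapping into its possible values and takes the cartesian product of those option lists. A small sketch of that idea with illustrative mappings, using plain `itertools` and independent of the removed class:

```python
# Cartesian product of per-mapping options, as in discretize_action_space_desc.
import itertools

fire_options = [('SpaceBar', False), ('SpaceBar', True)]          # action mapping
move_options = [('Right', 0.0), ('Right', 1.0), ('Right', -1.0)]  # axis mapping (0.0 = idle)

combinations = [
    sorted(combo, key=lambda x: x[0])
    for combo in itertools.product(fire_options, move_options)
]
print(len(combinations))  # 2 * 3 = 6 discrete actions
```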
- if key_and_scale[1] not in set_: - list_for_record.append((head_key, key_and_scale[1] / head_value)) - set_.add(key_and_scale[1]) - else: - # Action-mapping - list_for_record = [(record["keys"][0], False), (record["keys"][0], True)] - unique_list.append(list_for_record) - - def so(in_): - # in_ is List[Tuple[str,any]] -> sort by concat'd sequence of str(any's) - st = "" - for i in in_: - st += str(i[1]) - return st - - # Then sort and get the entire list of all possible sorted meaningful key-combinations. - combinations = list(itertools.product(*unique_list)) - combinations = list(map(lambda x: sorted(list(x), key=lambda y: y[0]), combinations)) - combinations = sorted(combinations, key=so) - # Store that list as discretized_actions. - self.discretized_actions = combinations - - @staticmethod - def extract_observation(message): - if "obs_dict" not in message: - raise TensorForceError("Message without field 'obs_dict' received!") - - ret = message["obs_dict"] - # Only one observer -> use that one (no dict of dicts). - if len(ret) == 1: - ret = list(ret.values())[0] - return ret - diff --git a/tensorforce/core/__init__.py b/tensorforce/core/__init__.py index 0cf2dc5b2..dd3617981 100755 --- a/tensorforce/core/__init__.py +++ b/tensorforce/core/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,3 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== + +# utils +from tensorforce.core.utils import ArrayDict, ListDict, ModuleDict, NestedDict, SignatureDict, \ + TensorDict, TensorSpec, TensorsSpec, tf_util, VariableDict + +# Basics +from tensorforce.core.config import TensorforceConfig +from tensorforce.core.module import Module, tf_function # TODO: part of Module +from tensorforce.core.parameters import parameter_modules + +# Require parameter_modules +from tensorforce.core.layers import layer_modules +from tensorforce.core.memories import memory_modules +from tensorforce.core.objectives import objective_modules +from tensorforce.core.optimizers import optimizer_modules + +# Require layer_modules +from tensorforce.core.distributions import distribution_modules +from tensorforce.core.networks import network_modules + +# Require network_modules +from tensorforce.core.policies import policy_modules + + +__all__ = [ + 'distribution_modules', 'layer_modules', 'memory_modules', 'Module', 'network_modules', + 'objective_modules', 'optimizer_modules', 'parameter_modules', 'policy_modules', 'tf_function' +] diff --git a/tensorforce/core/baselines/aggregated_baseline.py b/tensorforce/core/baselines/aggregated_baseline.py deleted file mode 100755 index 5168bd1ee..000000000 --- a/tensorforce/core/baselines/aggregated_baseline.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce.core.networks import Linear -from tensorforce.core.baselines import Baseline - - -class AggregatedBaseline(Baseline): - """ - Baseline which aggregates per-state baselines. - """ - - def __init__(self, baselines, scope='aggregated-baseline', summary_labels=()): - """ - Aggregated baseline. - - Args: - baselines: Dict of per-state baseline specification dicts - """ - - self.baselines = dict() - for name, baseline in baselines.items(): - self.baselines[name] = Baseline.from_spec( - spec=baseline, - kwargs=dict(summary_labels=summary_labels)) - - self.linear = Linear(size=1, bias=0.0, scope='prediction') - - super(AggregatedBaseline, self).__init__(scope, summary_labels) - - def tf_predict(self, states, internals, update): - predictions = list() - for name, state in states.items(): - prediction = self.baselines[name].predict(states=state, internals=internals, update=update) - predictions.append(prediction) - predictions = tf.stack(values=predictions, axis=1) - prediction = self.linear.apply(x=predictions) - return tf.squeeze(input=prediction, axis=1) - - def tf_regularization_loss(self): - regularization_loss = super(AggregatedBaseline, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - for baseline in self.baselines.values(): - regularization_loss = baseline.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - regularization_loss = self.linear.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None - - def get_variables(self, include_nontrainable=False): - baseline_variables = super(AggregatedBaseline, self).get_variables(include_nontrainable=include_nontrainable) - baselines_variables = [ - variable for name in sorted(self.baselines) - for variable in self.baselines[name].get_variables(include_nontrainable=include_nontrainable) - ] - linear_variables = self.linear.get_variables(include_nontrainable=include_nontrainable) - - return baseline_variables + baselines_variables + linear_variables - - def get_summaries(self): - baseline_summaries = super(AggregatedBaseline, self).get_summaries() - baselines_summaries = [ - variable for name in sorted(self.baselines) - for variable in self.baselines[name].get_summaries() - ] - linear_summaries = self.linear.get_summaries() - - return baseline_summaries + baselines_summaries + linear_summaries diff --git a/tensorforce/core/baselines/baseline.py b/tensorforce/core/baselines/baseline.py deleted file mode 100755 index 5c918ad24..000000000 --- a/tensorforce/core/baselines/baseline.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce import util -import tensorforce.core.baselines - - -class Baseline(object): - """ - Base class for baseline value functions. - """ - - def __init__(self, scope='baseline', summary_labels=None): - """ - Baseline. - """ - self.summary_labels = set(summary_labels or ()) - - self.variables = dict() - self.all_variables = dict() - self.summaries = list() - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - self.all_variables[name] = variable - if kwargs.get('trainable', True): - self.variables[name] = variable - if 'variables' in self.summary_labels: - summary = tf.summary.histogram(name=name, values=variable) - self.summaries.append(summary) - return variable - - self.predict = tf.make_template( - name_=(scope + '/predict'), - func_=self.tf_predict, - custom_getter_=custom_getter - ) - self.reference = tf.make_template( - name_=(scope + '/reference'), - func_=self.tf_reference, - custom_getter_=custom_getter - ) - self.loss = tf.make_template( - name_=(scope + '/loss'), - func_=self.tf_loss, - custom_getter_=custom_getter - ) - self.regularization_loss = tf.make_template( - name_=(scope + '/regularization-loss'), - func_=self.tf_regularization_loss, - custom_getter_=custom_getter - ) - - def tf_predict(self, states, internals, update): - """ - Creates the TensorFlow operations for predicting the value function of given states. - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - update: Boolean tensor indicating whether this call happens during an update. - Returns: - State value tensor - """ - raise NotImplementedError - - def tf_reference(self, states, internals, reward, update): - """ - Creates the TensorFlow operations for obtaining the reference tensor(s), in case of a - comparative loss. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - reward: Reward tensor. - update: Boolean tensor indicating whether this call happens during an update. - - Returns: - Reference tensor(s). - """ - return None - - def tf_loss(self, states, internals, reward, update, reference=None): - """ - Creates the TensorFlow operations for calculating the L2 loss between predicted - state values and actual rewards. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - reward: Reward tensor. - update: Boolean tensor indicating whether this call happens during an update. - reference: Optional reference tensor(s), in case of a comparative loss. 
- - Returns: - Loss tensor - """ - prediction = self.predict(states=states, internals=internals, update=update) - return tf.nn.l2_loss(t=(prediction - reward)) - - def tf_regularization_loss(self): - """ - Creates the TensorFlow operations for the baseline regularization loss/ - - Returns: - Regularization loss tensor - """ - return None - - def get_variables(self, include_nontrainable=False): - """ - Returns the TensorFlow variables used by the baseline. - - Returns: - List of variables - """ - if include_nontrainable: - return [self.all_variables[key] for key in sorted(self.all_variables)] - else: - return [self.variables[key] for key in sorted(self.variables)] - - def get_summaries(self): - """ - Returns the TensorFlow summaries reported by the baseline - - Returns: - List of summaries - """ - return self.summaries - - @staticmethod - def from_spec(spec, kwargs=None): - """ - Creates a baseline from a specification dict. - """ - baseline = util.get_object( - obj=spec, - predefined_objects=tensorforce.core.baselines.baselines, - kwargs=kwargs - ) - assert isinstance(baseline, Baseline) - return baseline diff --git a/tensorforce/core/baselines/cnn_baseline.py b/tensorforce/core/baselines/cnn_baseline.py deleted file mode 100755 index f745822e6..000000000 --- a/tensorforce/core/baselines/cnn_baseline.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.core.baselines import NetworkBaseline - - -class CNNBaseline(NetworkBaseline): - """ - CNN baseline (single-state) consisting of convolutional layers followed by dense layers. - """ - - def __init__(self, conv_sizes, dense_sizes, scope='cnn-baseline', summary_labels=()): - """ - CNN baseline. - - Args: - conv_sizes: List of convolutional layer sizes - dense_sizes: List of dense layer sizes - """ - - network = [] - for size in conv_sizes: - network.append(dict(type='conv2d', size=size)) - - # First layer has a larger window. - network[0]['window'] = 5 - - network.append(dict(type='flatten')) # TODO: change to max pooling! - for size in dense_sizes: - network.append(dict(type='dense', size=size)) - - super(CNNBaseline, self).__init__(network=network, scope=scope, summary_labels=summary_labels) diff --git a/tensorforce/core/baselines/mlp_baseline.py b/tensorforce/core/baselines/mlp_baseline.py deleted file mode 100755 index 25e476f9c..000000000 --- a/tensorforce/core/baselines/mlp_baseline.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.core.baselines import NetworkBaseline - - -class MLPBaseline(NetworkBaseline): - """ - Multi-layer perceptron baseline (single-state) consisting of dense layers. - """ - - def __init__(self, sizes, scope='mlp-baseline', summary_labels=()): - """ - Multi-layer perceptron baseline. - - Args: - sizes: List of dense layer sizes - """ - - network = [] - for size in sizes: - network.append(dict(type='dense', size=size)) - - super(MLPBaseline, self).__init__(network=network, scope=scope, summary_labels=summary_labels) diff --git a/tensorforce/core/baselines/network_baseline.py b/tensorforce/core/baselines/network_baseline.py deleted file mode 100755 index 990139b90..000000000 --- a/tensorforce/core/baselines/network_baseline.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce.core.networks import Linear, Network -from tensorforce.core.baselines import Baseline - - -class NetworkBaseline(Baseline): - """ - Baseline based on a TensorForce network, used when parameters are shared between the value - function and the baseline. - """ - - def __init__(self, network, scope='network-baseline', summary_labels=()): - """ - Network baseline. 
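The two removed baselines above only assemble layer-spec lists and defer everything else to NetworkBaseline. A sketch of the list the removed CNNBaseline built internally; the layer sizes are illustrative:

```python
# Layer-spec list assembled by the removed CNNBaseline.
conv_sizes, dense_sizes = [32, 32], [64]

network = [dict(type='conv2d', size=size) for size in conv_sizes]
network[0]['window'] = 5  # first layer uses a larger window
network.append(dict(type='flatten'))
network += [dict(type='dense', size=size) for size in dense_sizes]

print(network)
```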
- - Args: - network_spec: Network specification dict - """ - self.network = Network.from_spec( - spec=network, - kwargs=dict(summary_labels=summary_labels) - ) - assert len(self.network.internals_spec()) == 0 - - self.linear = Linear(size=1, bias=0.0, scope='prediction') - - super(NetworkBaseline, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_predict(self, states, internals, update): - embedding = self.network.apply(x=states, internals=internals, update=update) - prediction = self.linear.apply(x=embedding) - return tf.squeeze(input=prediction, axis=1) - - def tf_regularization_loss(self): - regularization_loss = super(NetworkBaseline, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - regularization_loss = self.network.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - regularization_loss = self.linear.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None - - def get_variables(self, include_nontrainable=False): - baseline_variables = super(NetworkBaseline, self).get_variables(include_nontrainable=include_nontrainable) - network_variables = self.network.get_variables(include_nontrainable=include_nontrainable) - layer_variables = self.linear.get_variables(include_nontrainable=include_nontrainable) - - return baseline_variables + network_variables + layer_variables - - def get_summaries(self): - baseline_summaries = super(NetworkBaseline, self).get_summaries() - network_summaries = self.network.get_summaries() - layer_summaries = self.linear.get_summaries() - - return baseline_summaries + network_summaries + layer_summaries diff --git a/tensorforce/core/config.py b/tensorforce/core/config.py new file mode 100644 index 000000000..4dcab95df --- /dev/null +++ b/tensorforce/core/config.py @@ -0,0 +1,67 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + + +class TensorforceConfig(object): + + # modify dtype mappings + + def __init__( + self, *, + buffer_observe=False, + create_debug_assertions=False, + create_tf_assertions=True, + device='CPU', + eager_mode=False, + enable_int_action_masking=True, + name='agent', + seed=None, + tf_log_level=40 + ): + assert buffer_observe is False or buffer_observe == 'episode' or \ + isinstance(buffer_observe, int) and buffer_observe >= 1 + if buffer_observe is False: + buffer_observe = 1 + super().__setattr__('buffer_observe', buffer_observe) + + assert isinstance(create_debug_assertions, bool) + super().__setattr__('create_debug_assertions', create_debug_assertions) + + assert isinstance(create_tf_assertions, bool) + super().__setattr__('create_tf_assertions', create_tf_assertions) + + assert isinstance(eager_mode, bool) + super().__setattr__('eager_mode', eager_mode) + + assert isinstance(enable_int_action_masking, bool) + super().__setattr__('enable_int_action_masking', enable_int_action_masking) + + assert device is None or isinstance(device, str) # more specific? + super().__setattr__('device', device) + + assert isinstance(name, str) + super().__setattr__('name', name) + + assert seed is None or isinstance(seed, int) + super().__setattr__('seed', seed) + + assert isinstance(tf_log_level, int) and tf_log_level >= 0 + super().__setattr__('tf_log_level', tf_log_level) + + def __setattr__(self, name, value): + raise NotImplementedError + + def __delattr__(self, name): + raise NotImplementedError diff --git a/tensorforce/core/distributions/__init__.py b/tensorforce/core/distributions/__init__.py index b8e34d42e..4f1e28e35 100755 --- a/tensorforce/core/distributions/__init__.py +++ b/tensorforce/core/distributions/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,25 +14,16 @@ # ============================================================================== from tensorforce.core.distributions.distribution import Distribution + from tensorforce.core.distributions.bernoulli import Bernoulli +from tensorforce.core.distributions.beta import Beta from tensorforce.core.distributions.categorical import Categorical from tensorforce.core.distributions.gaussian import Gaussian -from tensorforce.core.distributions.beta import Beta -distributions = dict( - bernoulli=Bernoulli, - categorical=Categorical, - gaussian=Gaussian, - beta=Beta +distribution_modules = dict( + bernoulli=Bernoulli, beta=Beta, categorical=Categorical, gaussian=Gaussian ) -__all__ = [ - 'distributions', - 'Distribution', - 'Bernoulli', - 'Categorical', - 'Gaussian', - 'Beta' -] +__all__ = ['Bernoulli', 'Beta', 'Categorical', 'Distribution', 'distribution_modules', 'Gaussian'] diff --git a/tensorforce/core/distributions/bernoulli.py b/tensorforce/core/distributions/bernoulli.py index 7cca4f6b3..351b8efb5 100755 --- a/tensorforce/core/distributions/bernoulli.py +++ b/tensorforce/core/distributions/bernoulli.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,129 +13,219 @@ # limitations under the License. 
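The TensorforceConfig added above normalizes `buffer_observe=False` to 1 and freezes all attributes after construction. A usage sketch based on that code; the `seed` and `device` values are arbitrary:

```python
# Using the new TensorforceConfig; attributes are read-only after __init__.
from tensorforce.core.config import TensorforceConfig

config = TensorforceConfig(seed=0, device='CPU')
print(config.buffer_observe)  # 1, since the default False is normalized to 1

try:
    config.seed = 1
except NotImplementedError:
    print('TensorforceConfig is immutable')
```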
# ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from math import log import tensorflow as tf -from tensorforce import util -from tensorforce.core.networks import Linear +from tensorforce import TensorforceError, util +from tensorforce.core import layer_modules, TensorDict, TensorSpec, TensorsSpec, tf_function, \ + tf_util from tensorforce.core.distributions import Distribution class Bernoulli(Distribution): """ - Bernoulli distribution, for binary boolean actions. + Bernoulli distribution, for binary boolean actions (specification key: `bernoulli`). + + Args: + name (string): internal use. + action_spec (specification): internal use. + input_spec (specification): internal use. """ - def __init__(self, shape, probability=0.5, scope='bernoulli', summary_labels=()): - """ - Bernoulli distribution. + def __init__(self, *, name=None, action_spec=None, input_spec=None): + assert action_spec.type == 'bool' - Args: - shape: Action shape. - probability: Optional distribution bias. - """ - self.shape = shape - action_size = util.prod(self.shape) + parameters_spec = TensorsSpec( + true_logit=TensorSpec(type='float', shape=action_spec.shape), + false_logit=TensorSpec(type='float', shape=action_spec.shape), + probability=TensorSpec(type='float', shape=action_spec.shape), + state_value=TensorSpec(type='float', shape=action_spec.shape) + ) + conditions_spec = TensorsSpec() - self.logit = Linear(size=action_size, bias=log(probability), scope='logit') + super().__init__( + name=name, action_spec=action_spec, input_spec=input_spec, + parameters_spec=parameters_spec, conditions_spec=conditions_spec + ) - super(Bernoulli, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) + if self.input_spec.rank == 1: + # Single embedding + action_size = util.product(xs=self.action_spec.shape, empty=0) + self.logit = self.submodule( + name='logit', module='linear', modules=layer_modules, size=action_size, + initialization_scale=0.01, input_spec=self.input_spec + ) - def tf_parameterize(self, x): - # Flat logit + else: + # Embedding per action + if self.input_spec.rank < 1 or self.input_spec.rank > 3: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='invalid rank' + ) + if self.input_spec.shape[:-1] == self.action_spec.shape[:-1]: + size = self.action_spec.shape[-1] + elif self.input_spec.shape[:-1] == self.action_spec.shape: + size = 0 + else: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='not flattened and incompatible with action shape' + ) + self.logit = self.submodule( + name='logit', module='linear', modules=layer_modules, size=size, + initialization_scale=0.01, input_spec=self.input_spec + ) + + def get_architecture(self): + return 'Logit: {}'.format(self.logit.get_architecture()) + + def initialize(self): + super().initialize() + + name = 'distributions/' + self.name + '-probability' + self.register_summary(label='distribution', name=name) + + spec = self.parameters_spec['probability'] + self.register_tracking(label='distribution', name='probability', spec=spec) + + @tf_function(num_args=2) + def parametrize(self, *, x, conditions): + one = tf_util.constant(value=1.0, dtype='float') + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + shape = (-1,) + self.action_spec.shape + + # Logit logit = self.logit.apply(x=x) + 
if self.input_spec.rank == 1: + logit = tf.reshape(tensor=logit, shape=shape) - # Reshape logit to action shape - shape = (-1,) + self.shape - logit = tf.reshape(tensor=logit, shape=shape) - - # TODO rename + # States value state_value = logit # Sigmoid for corresponding probability probability = tf.sigmoid(x=logit) - # Min epsilon probability for numerical stability - probability = tf.clip_by_value( - t=probability, - clip_value_min=util.epsilon, - clip_value_max=(1.0 - util.epsilon) + # "Normalized" logits + true_logit = tf.math.log(x=(probability + epsilon)) + false_logit = tf.math.log(x=(one - probability + epsilon)) + + return TensorDict( + true_logit=true_logit, false_logit=false_logit, probability=probability, + state_value=state_value ) - # "Normalized" logits - true_logit = tf.log(x=probability) - false_logit = tf.log(x=(1.0 - probability)) + @tf_function(num_args=1) + def mode(self, *, parameters, independent): + probability = parameters['probability'] - return true_logit, false_logit, probability, state_value + # Distribution parameter summaries + dependencies = list() + if not independent: + def fn_summary(): + axis = range(self.action_spec.rank + 1) + return tf.math.reduce_mean(input_tensor=probability, axis=axis) - def state_value(self, distr_params): - _, _, _, state_value = distr_params - return state_value + name = 'distributions/' + self.name + '-probability' + dependencies.extend(self.summary( + label='distribution', name=name, data=fn_summary, step='timesteps' + )) - def state_action_value(self, distr_params, action=None): - true_logit, false_logit, _, state_value = distr_params - if action is None: - state_value = tf.expand_dims(input=state_value, axis=-1) - logits = tf.stack(values=(false_logit, true_logit), axis=-1) - else: - logits = tf.where(condition=action, x=true_logit, y=false_logit) - return state_value + logits + # Distribution parameter tracking + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=probability, axis=0) - def tf_sample(self, distr_params, deterministic): - _, _, probability, _ = distr_params + dependencies.extend(self.track(label='distribution', name='probability', data=fn_tracking)) - # Deterministic: true if >= 0.5 - definite = tf.greater_equal(x=probability, y=0.5) + with tf.control_dependencies(control_inputs=dependencies): + return tf.greater_equal(x=probability, y=tf_util.constant(value=0.5, dtype='float')) - # Non-deterministic: sample true if >= uniform distribution - uniform = tf.random_uniform(shape=tf.shape(probability)) - sampled = tf.greater_equal(x=probability, y=uniform) + @tf_function(num_args=2) + def sample(self, *, parameters, temperature, independent): + true_logit, false_logit, probability = parameters.get( + ('true_logit', 'false_logit', 'probability') + ) - return tf.where(condition=deterministic, x=definite, y=sampled) + # Distribution parameter summaries + dependencies = list() + if not independent: + def fn_summary(): + axis = range(self.action_spec.rank + 1) + return tf.math.reduce_mean(input_tensor=probability, axis=axis) + + name = 'distributions/' + self.name + '-probability' + dependencies.extend(self.summary( + label='distribution', name=name, data=fn_summary, step='timesteps' + )) + + # Distribution parameter tracking + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=probability, axis=0) + + dependencies.extend(self.track(label='distribution', name='probability', data=fn_tracking)) + + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + + def fn_mode(): + # Deterministic: true 
if >= 0.5 + half = tf_util.constant(value=0.5, dtype='float') + return tf.greater_equal(x=probability, y=half) + + def fn_sample(): + # Non-deterministic: sample true if >= uniform distribution + # Exp numerically stable since logits <= 0.0 + e_true_logit = tf.math.exp(x=(true_logit / (temperature + epsilon))) + e_false_logit = tf.math.exp(x=(false_logit / (temperature + epsilon))) + probability = e_true_logit / (e_true_logit + e_false_logit + epsilon) + uniform = tf.random.uniform( + shape=tf.shape(input=probability), dtype=tf_util.get_dtype(type='float') + ) + return tf.greater_equal(x=probability, y=uniform) + + with tf.control_dependencies(control_inputs=dependencies): + return tf.cond(pred=(temperature < epsilon), true_fn=fn_mode, false_fn=fn_sample) + + @tf_function(num_args=2) + def log_probability(self, *, parameters, action): + true_logit, false_logit = parameters.get(('true_logit', 'false_logit')) - def tf_log_probability(self, distr_params, action): - true_logit, false_logit, _, _ = distr_params return tf.where(condition=action, x=true_logit, y=false_logit) - def tf_entropy(self, distr_params): - true_logit, false_logit, probability, _ = distr_params - return -probability * true_logit - (1.0 - probability) * false_logit + @tf_function(num_args=1) + def entropy(self, *, parameters): + true_logit, false_logit, probability = parameters.get( + ('true_logit', 'false_logit', 'probability') + ) + + one = tf_util.constant(value=1.0, dtype='float') + + return -probability * true_logit - (one - probability) * false_logit + + @tf_function(num_args=2) + def kl_divergence(self, *, parameters1, parameters2): + true_logit1, false_logit1, probability1 = parameters1.get( + ('true_logit', 'false_logit', 'probability') + ) + true_logit2, false_logit2 = parameters2.get(('true_logit', 'false_logit')) - def tf_kl_divergence(self, distr_params1, distr_params2): - true_logit1, false_logit1, probability1, _ = distr_params1 - true_logit2, false_logit2, _, _ = distr_params2 true_log_prob_ratio = true_logit1 - true_logit2 false_log_prob_ratio = false_logit1 - false_logit2 - return probability1 * true_log_prob_ratio + (1.0 - probability1) * false_log_prob_ratio - def tf_regularization_loss(self): - regularization_loss = super(Bernoulli, self).tf_regularization_loss() - if super(Bernoulli, self).tf_regularization_loss() is None: - losses = list() - else: - losses = [regularization_loss] + one = tf_util.constant(value=1.0, dtype='float') - regularization_loss = self.logit.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) + return probability1 * true_log_prob_ratio + (one - probability1) * false_log_prob_ratio - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None + @tf_function(num_args=2) + def action_value(self, *, parameters, action): + true_logit, false_logit, state_value = parameters.get( + ('true_logit', 'false_logit', 'state_value') + ) - def get_variables(self, include_nontrainable=False): - distribution_variables = super(Bernoulli, self).get_variables(include_nontrainable=include_nontrainable) - logit_variables = self.logit.get_variables(include_nontrainable=include_nontrainable) + logits = tf.where(condition=action, x=true_logit, y=false_logit) - return distribution_variables + logit_variables + return state_value + logits - def get_summaries(self): - distribution_summaries = super(Bernoulli, self).get_summaries() - logit_summaries = self.logit.get_summaries() + @tf_function(num_args=1) + def state_value(self, *, parameters): + 
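The updated Bernoulli distribution stores "normalized" logits, i.e. log(p + eps) and log(1 - p + eps), and derives entropy and KL divergence from them. A plain-NumPy illustration of that parametrization, with arbitrary example logits:

```python
# Normalized-logit parametrization mirrored from the new Bernoulli distribution.
import numpy as np

logit = np.array([0.3, -1.2])
probability = 1.0 / (1.0 + np.exp(-logit))         # sigmoid, as in parametrize()
epsilon = 1e-6
true_logit = np.log(probability + epsilon)         # log p(action=True)
false_logit = np.log(1.0 - probability + epsilon)  # log p(action=False)

entropy = -probability * true_logit - (1.0 - probability) * false_logit
print(entropy)
```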
state_value = parameters['state_value'] - return distribution_summaries + logit_summaries + return state_value diff --git a/tensorforce/core/distributions/beta.py b/tensorforce/core/distributions/beta.py index 4587e1862..8445f50c9 100644 --- a/tensorforce/core/distributions/beta.py +++ b/tensorforce/core/distributions/beta.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,130 +13,261 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from math import log +import numpy as np import tensorflow as tf -from tensorforce import util -from tensorforce.core.networks import Linear +from tensorforce import TensorforceError, util +from tensorforce.core import layer_modules, TensorDict, TensorSpec, TensorsSpec, tf_function, \ + tf_util from tensorforce.core.distributions import Distribution class Beta(Distribution): """ - Beta distribution, for bounded continuous actions. + Beta distribution, for bounded continuous actions (specification key: `beta`). + + Args: + name (string): internal use. + action_spec (specification): internal use. + input_spec (specification): internal use. """ - def __init__(self, shape, min_value, max_value, alpha=0.0, beta=0.0, scope='beta', summary_labels=()): - """ - Beta distribution. - - Args: - shape: Action shape. - min_value: Minimum value of continuous actions. - max_value: Maximum value of continuous actions. - alpha: Optional distribution bias for the alpha value. - beta: Optional distribution bias for the beta value. 
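The rewritten Bernoulli distribution above parametrizes the action as probability = sigmoid(logit) with "normalized" true/false logits, and samples by comparing a temperature-scaled probability against a uniform draw, falling back to the mode when the temperature is effectively zero. A minimal NumPy sketch of that scheme, outside the TensorFlow graph, is shown below; EPSILON stands in for util.epsilon, and all names and example values are illustrative rather than part of the patch.

```python
# Illustrative sketch only, not part of the patch.
import numpy as np

EPSILON = 1e-6  # stand-in for util.epsilon


def bernoulli_parametrize(logit):
    probability = 1.0 / (1.0 + np.exp(-logit))           # sigmoid
    true_logit = np.log(probability + EPSILON)           # "normalized" logits
    false_logit = np.log(1.0 - probability + EPSILON)
    return true_logit, false_logit, probability


def bernoulli_sample(logit, temperature, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    true_logit, false_logit, probability = bernoulli_parametrize(logit)
    if temperature < EPSILON:
        # Mode: true iff probability >= 0.5
        return probability >= 0.5
    # Temperature-scaled probability; exp is numerically stable since logits <= 0
    e_true = np.exp(true_logit / (temperature + EPSILON))
    e_false = np.exp(false_logit / (temperature + EPSILON))
    probability = e_true / (e_true + e_false + EPSILON)
    return probability >= rng.uniform(size=np.shape(logit))


print(bernoulli_sample(np.array([2.0, -1.0, 0.0]), temperature=1.0))
```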
- """ - assert min_value is None or max_value > min_value - self.shape = shape - self.min_value = min_value - self.max_value = max_value - action_size = util.prod(self.shape) - - self.alpha = Linear(size=action_size, bias=alpha, scope='alpha') - self.beta = Linear(size=action_size, bias=beta, scope='beta') - - super(Beta, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_parameterize(self, x): + def __init__(self, *, name=None, action_spec=None, input_spec=None): + assert action_spec.type == 'float' and action_spec.min_value is not None and \ + action_spec.max_value is not None + + parameters_spec = TensorsSpec( + alpha=TensorSpec(type='float', shape=action_spec.shape), + beta=TensorSpec(type='float', shape=action_spec.shape), + alpha_beta=TensorSpec(type='float', shape=action_spec.shape), + log_norm=TensorSpec(type='float', shape=action_spec.shape) + ) + conditions_spec = TensorsSpec() + + super().__init__( + name=name, action_spec=action_spec, input_spec=input_spec, + parameters_spec=parameters_spec, conditions_spec=conditions_spec + ) + + if self.input_spec.rank == 1: + # Single embedding + action_size = util.product(xs=self.action_spec.shape, empty=0) + self.alpha = self.submodule( + name='alpha', module='linear', modules=layer_modules, size=action_size, + initialization_scale=0.01, input_spec=self.input_spec + ) + self.beta = self.submodule( + name='beta', module='linear', modules=layer_modules, size=action_size, + initialization_scale=0.01, input_spec=self.input_spec + ) + + else: + # Embedding per action + if self.input_spec.rank < 1 or self.input_spec.rank > 3: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='invalid rank' + ) + if self.input_spec.shape[:-1] == self.action_spec.shape[:-1]: + size = self.action_spec.shape[-1] + elif self.input_spec.shape[:-1] == self.action_spec.shape: + size = 0 + else: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='not flattened and incompatible with action shape' + ) + self.alpha = self.submodule( + name='alpha', module='linear', modules=layer_modules, size=size, + initialization_scale=0.01, input_spec=self.input_spec + ) + self.beta = self.submodule( + name='beta', module='linear', modules=layer_modules, size=size, + initialization_scale=0.01, input_spec=self.input_spec + ) + + def get_architecture(self): + return 'Alpha: {}\nBeta: {}'.format( + self.alpha.get_architecture(), self.beta.get_architecture() + ) + + def initialize(self): + super().initialize() + + prefix = 'distributions/' + self.name + names = (prefix + '-alpha', prefix + '-beta') + self.register_summary(label='distribution', name=names) + + spec = self.parameters_spec['alpha'] + self.register_tracking(label='distribution', name='alpha', spec=spec) + self.register_tracking(label='distribution', name='beta', spec=spec) + + @tf_function(num_args=2) + def parametrize(self, *, x, conditions): # Softplus to ensure alpha and beta >= 1 - # epsilon < 1.0, hence negative - log_eps = log(util.epsilon) + one = tf_util.constant(value=1.0, dtype='float') + log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float') + shape = (-1,) + self.action_spec.shape + # Alpha alpha = self.alpha.apply(x=x) - alpha = tf.clip_by_value(t=alpha, clip_value_min=log_eps, clip_value_max=-log_eps) - alpha = tf.log(x=(tf.exp(x=alpha) + 1.0)) + 1.0 + # epsilon < 1.0, hence negative + alpha = tf.clip_by_value(t=alpha, clip_value_min=log_epsilon, 
clip_value_max=-log_epsilon) + alpha = tf.math.exp(x=alpha) + one # tf.math.softplus(features=beta) ??? + if self.input_spec.rank == 1: + alpha = tf.reshape(tensor=alpha, shape=shape) + # Beta beta = self.beta.apply(x=x) - beta = tf.clip_by_value(t=beta, clip_value_min=log_eps, clip_value_max=-log_eps) - beta = tf.log(x=(tf.exp(x=beta) + 1.0)) + 1.0 + # epsilon < 1.0, hence negative + beta = tf.clip_by_value(t=beta, clip_value_min=log_epsilon, clip_value_max=-log_epsilon) + beta = tf.math.exp(x=beta) + one # tf.math.softplus(features=beta) ??? + if self.input_spec.rank == 1: + beta = tf.reshape(tensor=beta, shape=shape) - shape = (-1,) + self.shape - alpha = tf.reshape(tensor=alpha, shape=shape) - beta = tf.reshape(tensor=beta, shape=shape) + # Alpha + Beta + alpha_beta = alpha + beta # > 2.0 so no +epsilon required - alpha_beta = tf.maximum(x=(alpha + beta), y=util.epsilon) - log_norm = tf.lgamma(x=alpha) + tf.lgamma(x=beta) - tf.lgamma(x=alpha_beta) + # Log norm + log_norm = tf.math.lgamma(x=alpha) + tf.math.lgamma(x=beta) - tf.math.lgamma(x=alpha_beta) - return alpha, beta, alpha_beta, log_norm + return TensorDict(alpha=alpha, beta=beta, alpha_beta=alpha_beta, log_norm=log_norm) - def tf_sample(self, distr_params, deterministic): - alpha, beta, alpha_beta, _ = distr_params + @tf_function(num_args=1) + def mode(self, *, parameters, independent): + alpha, beta, alpha_beta = parameters.get(('alpha', 'beta', 'alpha_beta')) - # Deterministic: mean as action - definite = beta / alpha_beta + # Distribution parameter summaries + dependencies = list() + if not independent: + def fn_summary(): + a = tf.math.reduce_mean(input_tensor=alpha, axis=range(self.action_spec.rank + 1)) + b = tf.math.reduce_mean(input_tensor=beta, axis=range(self.action_spec.rank + 1)) + return a, b - # Non-deterministic: sample action using gamma distribution - alpha_sample = tf.random_gamma(shape=(), alpha=alpha) - beta_sample = tf.random_gamma(shape=(), alpha=beta) + prefix = 'distributions/' + self.name + names = (prefix + '-alpha', prefix + '-beta') + dependencies.extend(self.summary( + label='distribution', name=names, data=fn_summary, step='timesteps' + )) - sampled = beta_sample / tf.maximum(x=(alpha_sample + beta_sample), y=util.epsilon) + # Distribution parameter tracking + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=alpha, axis=0) - return self.min_value + (self.max_value - self.min_value) * \ - tf.where(condition=deterministic, x=definite, y=sampled) + dependencies.extend(self.track(label='distribution', name='alpha', data=fn_tracking)) - def tf_log_probability(self, distr_params, action): - alpha, beta, _, log_norm = distr_params - action = (action - self.min_value) / (self.max_value - self.min_value) - action = tf.minimum(x=action, y=(1.0 - util.epsilon)) - return (beta - 1.0) * tf.log(x=tf.maximum(x=action, y=util.epsilon)) + \ - (alpha - 1.0) * tf.log1p(x=-action) - log_norm + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=beta, axis=0) - def tf_entropy(self, distr_params): - alpha, beta, alpha_beta, log_norm = distr_params - return log_norm - (beta - 1.0) * tf.digamma(x=beta) - (alpha - 1.0) * tf.digamma(x=alpha) + \ - (alpha_beta - 2.0) * tf.digamma(x=alpha_beta) + dependencies.extend(self.track(label='distribution', name='beta', data=fn_tracking)) - def tf_kl_divergence(self, distr_params1, distr_params2): - alpha1, beta1, alpha_beta1, log_norm1 = distr_params1 - alpha2, beta2, alpha_beta2, log_norm2 = distr_params2 - return log_norm2 - log_norm1 - tf.digamma(x=beta1) * 
(beta2 - beta1) - \ - tf.digamma(x=alpha1) * (alpha2 - alpha1) + tf.digamma(x=alpha_beta1) * (alpha_beta2 - alpha_beta1) + with tf.control_dependencies(control_inputs=dependencies): + action = beta / alpha_beta - def tf_regularization_loss(self): - regularization_loss = super(Beta, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') - regularization_loss = self.alpha.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) + return min_value + (max_value - min_value) * action - regularization_loss = self.beta.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) + @tf_function(num_args=2) + def sample(self, *, parameters, temperature, independent): + alpha, beta, alpha_beta, log_norm = parameters.get( + ('alpha', 'beta', 'alpha_beta', 'log_norm') + ) - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None + # Distribution parameter summaries + dependencies = list() + if not independent: + def fn_summary(): + a = tf.math.reduce_mean(input_tensor=alpha, axis=range(self.action_spec.rank + 1)) + b = tf.math.reduce_mean(input_tensor=beta, axis=range(self.action_spec.rank + 1)) + return a, b + + prefix = 'distributions/' + self.name + names = (prefix + '-alpha', prefix + '-beta') + dependencies.extend(self.summary( + label='distribution', name=names, data=fn_summary, step='timesteps' + )) + + # Distribution parameter tracking + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=alpha, axis=0) + + dependencies.extend(self.track(label='distribution', name='alpha', data=fn_tracking)) + + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=beta, axis=0) + + dependencies.extend(self.track(label='distribution', name='beta', data=fn_tracking)) + + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + + def fn_mode(): + # Deterministic: mean as action + return beta / alpha_beta + + def fn_sample(): + # Non-deterministic: sample action using gamma distribution + alpha_sample = tf.random.gamma(shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float')) + beta_sample = tf.random.gamma(shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float')) + return beta_sample / (alpha_sample + beta_sample) + + action = tf.cond(pred=(temperature < epsilon), true_fn=fn_mode, false_fn=fn_sample) + + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') + + with tf.control_dependencies(control_inputs=dependencies): + return min_value + (max_value - min_value) * action + + @tf_function(num_args=2) + def log_probability(self, *, parameters, action): + alpha, beta, log_norm = parameters.get(('alpha', 'beta', 'log_norm')) + + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') + + action = (action - min_value) / (max_value - min_value) + + one = tf_util.constant(value=1.0, dtype='float') + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + + return tf.math.xlogy(x=(beta - one), y=(action + epsilon)) + \ + (alpha - one) * tf.math.log1p(x=(-action + epsilon)) - log_norm + + @tf_function(num_args=1) + def entropy(self, *, parameters): + alpha, beta, 
alpha_beta, log_norm = parameters.get( + ('alpha', 'beta', 'alpha_beta', 'log_norm') + ) + + one = tf_util.constant(value=1.0, dtype='float') + + digamma_alpha = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float') + digamma_beta = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float') + digamma_alpha_beta = tf_util.cast( + x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float' + ) - def get_variables(self, include_nontrainable=False): - distribution_variables = super(Beta, self).get_variables(include_nontrainable=include_nontrainable) - alpha_variables = self.alpha.get_variables(include_nontrainable=include_nontrainable) - beta_variables = self.beta.get_variables(include_nontrainable=include_nontrainable) + return log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \ + (alpha_beta - one - one) * digamma_alpha_beta - return distribution_variables + alpha_variables + beta_variables + @tf_function(num_args=2) + def kl_divergence(self, *, parameters1, parameters2): + alpha1, beta1, alpha_beta1, log_norm1 = parameters1.get( + ('alpha', 'beta', 'alpha_beta', 'log_norm') + ) + alpha2, beta2, alpha_beta2, log_norm2 = parameters2.get( + ('alpha', 'beta', 'alpha_beta', 'log_norm') + ) - def get_summaries(self): - distribution_summaries = super(Beta, self).get_summaries() - alpha_summaries = self.alpha.get_summaries() - beta_summaries = self.beta.get_summaries() + digamma_alpha1 = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=alpha1)), dtype='float') + digamma_beta1 = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=beta1)), dtype='float') + digamma_alpha_beta1 = tf_util.cast( + x=tf.math.digamma(x=tf_util.float32(x=alpha_beta1)), dtype='float' + ) - return distribution_summaries + alpha_summaries + beta_summaries + return log_norm2 - log_norm1 - digamma_beta1 * (beta2 - beta1) - \ + digamma_alpha1 * (alpha2 - alpha1) + digamma_alpha_beta1 * \ + (alpha_beta2 - alpha_beta1) diff --git a/tensorforce/core/distributions/categorical.py b/tensorforce/core/distributions/categorical.py index 67dfc332f..942b86a9d 100755 --- a/tensorforce/core/distributions/categorical.py +++ b/tensorforce/core/distributions/categorical.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,135 +13,399 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from math import log +import numpy as np import tensorflow as tf -from tensorforce import util -from tensorforce.core.networks import Linear +from tensorforce import TensorforceError, util +from tensorforce.core import layer_modules, TensorDict, TensorSpec, TensorsSpec, tf_function, \ + tf_util from tensorforce.core.distributions import Distribution class Categorical(Distribution): """ - Categorical distribution, for discrete actions. + Categorical distribution, for discrete integer actions (specification key: `categorical`). + + Args: + temperature_mode ("predicted" | "global"): Whether to predict the temperature via a linear + transformation of the state embedding, or to parametrize the temperature by a separate + set of trainable weights + (default: default temperature of 1). 
+ skip_linear (bool): Whether to not add the implicit linear logits layer, requires suitable + network output shape according to action space, not compatible with temperature_mode + (default: false). + name (string): internal use. + action_spec (specification): internal use. + input_spec (specification): internal use. """ - def __init__(self, shape, num_actions, probabilities=None, scope='categorical', summary_labels=()): - """ - Categorical distribution. - - Args: - shape: Action shape. - num_actions: Number of discrete action alternatives. - probabilities: Optional distribution bias. - """ - self.num_actions = num_actions - - action_size = util.prod(shape) * self.num_actions - if probabilities is None: - logits = 0.0 + def __init__( + self, *, temperature_mode=None, skip_linear=False, name=None, action_spec=None, + input_spec=None + ): + assert action_spec.type == 'int' and action_spec.num_values is not None + assert not skip_linear or temperature_mode is None + + if temperature_mode is None: + parameters_spec = TensorsSpec( + probabilities=TensorSpec( + type='float', shape=(action_spec.shape + (action_spec.num_values,)) + ), logits=TensorSpec( + type='float', shape=(action_spec.shape + (action_spec.num_values,)) + ), action_values=TensorSpec( + type='float', shape=(action_spec.shape + (action_spec.num_values,)) + ), state_value=TensorSpec(type='float', shape=action_spec.shape) + ) else: - logits = [log(prob) for _ in range(util.prod(shape)) for prob in probabilities] - self.logits = Linear(size=action_size, bias=logits, scope='logits') - - super(Categorical, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) + parameters_spec = TensorsSpec( + probabilities=TensorSpec( + type='float', shape=(action_spec.shape + (action_spec.num_values,)) + ), temperature=TensorSpec(type='float', shape=action_spec.shape), + logits=TensorSpec( + type='float', shape=(action_spec.shape + (action_spec.num_values,)) + ), action_values=TensorSpec( + type='float', shape=(action_spec.shape + (action_spec.num_values,)) + ), state_value=TensorSpec(type='float', shape=action_spec.shape) + ) + conditions_spec = TensorsSpec() + + super().__init__( + name=name, action_spec=action_spec, input_spec=input_spec, + parameters_spec=parameters_spec, conditions_spec=conditions_spec + ) - def tf_parameterize(self, x): - # Flat logits - logits = self.logits.apply(x=x) + self.temperature_mode = temperature_mode + + if self.config.enable_int_action_masking: + self.conditions_spec['mask'] = TensorSpec( + type='bool', shape=(self.action_spec.shape + (self.action_spec.num_values,)) + ) + + num_values = self.action_spec.num_values + if skip_linear: + self.action_values = None + if self.input_spec.shape[:-1] != self.action_spec.shape or \ + self.input_spec.shape[-1] != self.action_spec.num_values: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='!= (*action_shape, num_values)' + ) + elif self.input_spec.shape[:-1] != self.action_spec.shape and \ + self.input_spec.shape[:-1] != self.action_spec.shape[:-1]: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='not flattened and incompatible with action shape' + ) + + elif self.input_spec.rank == 1: + # Single embedding + self.action_values = self.submodule( + name='action_values', module='linear', modules=layer_modules, + size=(self.action_spec.size * num_values), initialization_scale=0.01, + input_spec=input_spec + ) + if self.temperature_mode == 
'predicted': + self.temperature = self.submodule( + name='temperature', module='linear', modules=layer_modules, + size=self.action_spec.size, initialization_scale=0.01, + input_spec=self.input_spec + ) - # Reshape logits to action shape - shape = (-1,) + self.shape + (self.num_actions,) - logits = tf.reshape(tensor=logits, shape=shape) + else: + # Embedding per action + if self.input_spec.rank < 1 or self.input_spec.rank > 3: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='invalid rank' + ) + if self.input_spec.shape[:-1] == self.action_spec.shape[:-1]: + size = self.action_spec.shape[-1] + elif self.input_spec.shape[:-1] == self.action_spec.shape: + size = 1 + else: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='not flattened and incompatible with action shape' + ) + self.action_values = self.submodule( + name='action_values', module='linear', modules=layer_modules, + size=(size * num_values), initialization_scale=0.01, input_spec=input_spec + ) + if self.temperature_mode == 'predicted': + self.temperature = self.submodule( + name='temperature', module='linear', modules=layer_modules, size=size, + initialization_scale=0.01, input_spec=self.input_spec + ) + + def get_architecture(self): + architecture = '' + if self.action_values is not None: + architecture += 'Logits: {}'.format(self.action_values.get_architecture()) + if self.temperature_mode == 'predicted': + architecture += '\nTemperature: {}'.format(self.temperature.get_architecture()) + return architecture + + def initialize(self): + super().initialize() + + if self.temperature_mode == 'global': + spec = TensorSpec(type='float', shape=((1,) + self.action_spec.shape + (1,))) + self.temperature = self.variable( + name='temperature', spec=spec, initializer='zeros', is_trainable=True, + is_saved=True + ) + + prefix = 'distributions/' + self.name + '-probability' + names = [prefix + str(n) for n in range(self.action_spec.num_values)] + if self.temperature_mode is not None: + names.append('distributions/' + self.name + '-temperature') + self.register_summary(label='distribution', name=names) + + spec = self.parameters_spec['probabilities'] + self.register_tracking(label='distribution', name='probabilities', spec=spec) + + if self.temperature_mode is not None: + spec = self.parameters_spec['temperature'] + self.register_tracking(label='distribution', name='temperature', spec=spec) + + @tf_function(num_args=2) + def parametrize(self, *, x, conditions): + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float') + log_two = tf_util.constant(value=np.log(2.0), dtype='float') + + # Action values + if self.action_values is None: + action_values = x + else: + action_values = self.action_values.apply(x=x) + shape = (-1,) + self.action_spec.shape + (self.action_spec.num_values,) + action_values = tf.reshape(tensor=action_values, shape=shape) + + # Softplus standard deviation + if self.temperature_mode == 'global': + multiples = (tf.shape(input=x)[0],) + tuple(1 for _ in range(self.action_spec.rank + 1)) + softplus_temperature = tf.tile(input=self.temperature, multiples=multiples) + elif self.temperature_mode == 'predicted': + softplus_temperature = self.temperature.apply(x=x) + shape = (-1,) + self.action_spec.shape + (1,) + softplus_temperature = tf.reshape(tensor=softplus_temperature, shape=shape) + + if self.temperature_mode is None: + # 
Logits + logits = action_values + + # Implicit states value + state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1) - # !!! - state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1) + else: + # Clip softplus_temperature for numerical stability (epsilon < 1.0, hence negative) + softplus_temperature = tf.clip_by_value( + t=softplus_temperature, clip_value_min=log_epsilon, clip_value_max=-log_epsilon + ) + + # Softplus transformation (based on https://arxiv.org/abs/2007.06059) + softplus_shift = tf_util.constant(value=0.2, dtype='float') + temperature = (tf.nn.softplus(features=softplus_temperature) + softplus_shift) / \ + (log_two + softplus_shift) + + # Logits + logits = action_values / temperature + + # Implicit states value + temperature = tf.squeeze(input=temperature, axis=-1) + state_value = temperature * tf.reduce_logsumexp(input_tensor=logits, axis=-1) + + # # Explicit states value and advantage-based action values + # state_value = self.state_value.apply(x=x) + # state_value = tf.reshape(tensor=state_value, shape=shape[:-1]) + # action_values = tf.expand_dims(input=state_value, axis=-1) + action_values + # action_values -= tf.math.reduce_mean(input_tensor=action_values, axis=-1, keepdims=True) + + # Action masking, affects action_values/probabilities/logits but not state_value + if self.config.enable_int_action_masking: + min_float = tf.fill( + dims=tf.shape(input=action_values), value=tf_util.get_dtype(type='float').min + ) + action_values = tf.where(condition=conditions['mask'], x=action_values, y=min_float) + logits = tf.where(condition=conditions['mask'], x=logits, y=min_float) # Softmax for corresponding probabilities - # TODO deprecated call, update when >1.5 becomes default install - probabilities = tf.nn.softmax(logits=logits, dim=-1) - - # Min epsilon probability for numerical stability - probabilities = tf.maximum(x=probabilities, y=util.epsilon) + probabilities = tf.nn.softmax(logits=logits, axis=-1) # "Normalized" logits - logits = tf.log(x=probabilities) - - return logits, probabilities, state_value + logits = tf.math.log(x=(probabilities + epsilon)) + # Unstable + # logits = tf.nn.log_softmax(logits=logits, axis=-1) + # Doesn't take masking into account + # logits = action_values - tf.expand_dims(input=state_value, axis=-1) ... 
/ temperature + + if self.temperature_mode is None: + return TensorDict( + probabilities=probabilities, logits=logits, action_values=action_values, + state_value=state_value + ) + else: + return TensorDict( + probabilities=probabilities, temperature=temperature, logits=logits, + action_values=action_values, state_value=state_value + ) + + @tf_function(num_args=1) + def mode(self, *, parameters, independent): + if self.temperature_mode is None: + probabilities, action_values = parameters.get(('probabilities', 'action_values')) + else: + probabilities, temperature, action_values = parameters.get( + ('probabilities', 'temperature', 'action_values') + ) + + # Distribution parameter summaries + dependencies = list() + if not independent: + def fn_summary(): + axis = range(self.action_spec.rank + 1) + probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis) + probs = [probs[n] for n in range(self.action_spec.num_values)] + if self.temperature_mode is not None: + probs.append(tf.math.reduce_mean(input_tensor=temperature, axis=axis)) + return probs + + prefix = 'distributions/' + self.name + '-probability' + names = [prefix + str(n) for n in range(self.action_spec.num_values)] + if self.temperature_mode is not None: + names.append('distributions/' + self.name + '-temperature') + dependencies.extend(self.summary( + label='distribution', name=names, data=fn_summary, step='timesteps' + )) + + # Distribution parameter tracking + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=probabilities, axis=0) + + dependencies.extend( + self.track(label='distribution', name='probabilities', data=fn_tracking) + ) - def state_value(self, distr_params): - _, _, state_value = distr_params - return state_value + if self.temperature_mode is not None: - def state_action_value(self, distr_params, action=None): - logits, _, state_value = distr_params - if action is None: - state_value = tf.expand_dims(input=state_value, axis=-1) - else: - one_hot = tf.one_hot(indices=action, depth=self.num_actions) - logits = tf.reduce_sum(input_tensor=(logits * one_hot), axis=-1) - return state_value + logits + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=temperature, axis=0) - def tf_sample(self, distr_params, deterministic): - logits, _, _ = distr_params + dependencies.extend( + self.track(label='distribution', name='temperature', data=fn_tracking) + ) - # Deterministic: maximum likelihood action - definite = tf.argmax(input=logits, axis=-1, output_type=util.tf_dtype('int')) + with tf.control_dependencies(control_inputs=dependencies): + action = tf.math.argmax(input=action_values, axis=-1) + return tf_util.cast(x=action, dtype='int') - # Non-deterministic: sample action using Gumbel distribution - uniform_distribution = tf.random_uniform( - shape=tf.shape(input=logits), - minval=util.epsilon, - maxval=(1.0 - util.epsilon) + @tf_function(num_args=2) + def sample(self, *, parameters, temperature, independent): + if self.temperature_mode is None: + probabilities, logits, action_values = parameters.get( + ('probabilities', 'logits', 'action_values') + ) + else: + probabilities, temp, logits, action_values = parameters.get( + ('probabilities', 'temperature', 'logits', 'action_values') + ) + + # Distribution parameter summaries + dependencies = list() + if not independent: + def fn_summary(): + axis = range(self.action_spec.rank + 1) + probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis) + probs = [probs[n] for n in range(self.action_spec.num_values)] + if self.temperature_mode is not 
None: + probs.append(tf.math.reduce_mean(input_tensor=temp, axis=axis)) + return probs + + prefix = 'distributions/' + self.name + '-probability' + names = [prefix + str(n) for n in range(self.action_spec.num_values)] + names.append('distributions/' + self.name + '-temperature') + dependencies.extend(self.summary( + label='distribution', name=names, data=fn_summary, step='timesteps' + )) + + # Distribution parameter tracking + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=probabilities, axis=0) + + dependencies.extend( + self.track(label='distribution', name='probabilities', data=fn_tracking) ) - gumbel_distribution = -tf.log(x=-tf.log(x=uniform_distribution)) - sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1, output_type=util.tf_dtype('int')) - return tf.where(condition=deterministic, x=definite, y=sampled) + if self.temperature_mode is not None: + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=temp, axis=0) + + dependencies.extend( + self.track(label='distribution', name='temperature', data=fn_tracking) + ) + + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + + def fn_mode(): + # Deterministic: maximum likelihood action + action = tf.math.argmax(input=action_values, axis=-1) + return tf_util.cast(x=action, dtype='int') + + def fn_sample(): + # Set logits to minimal value + min_float = tf.fill(dims=tf.shape(input=logits), value=tf_util.get_dtype(type='float').min) + temp_logits = logits / (temperature + epsilon) + temp_logits = tf.where(condition=(probabilities < epsilon), x=min_float, y=temp_logits) + + # Non-deterministic: sample action using Gumbel distribution + one = tf_util.constant(value=1.0, dtype='float') + uniform_distribution = tf.random.uniform( + shape=tf.shape(input=temp_logits), minval=epsilon, maxval=(one - epsilon), + dtype=tf_util.get_dtype(type='float') + ) + # Second log numerically stable since log(1-eps) ~ -eps + gumbel_distribution = -tf.math.log(x=-tf.math.log(x=uniform_distribution)) + action = tf.math.argmax(input=(temp_logits + gumbel_distribution), axis=-1) + return tf_util.cast(x=action, dtype='int') + + with tf.control_dependencies(control_inputs=dependencies): + return tf.cond(pred=(temperature < epsilon), true_fn=fn_mode, false_fn=fn_sample) + + @tf_function(num_args=2) + def log_probability(self, *, parameters, action): + logits = parameters['logits'] + + rank = self.action_spec.rank + 1 + action = tf.expand_dims(input=action, axis=rank) + logit = tf.gather(params=logits, indices=action, batch_dims=rank) + return tf.squeeze(input=logit, axis=rank) + + @tf_function(num_args=1) + def entropy(self, *, parameters): + probabilities, logits = parameters.get(('probabilities', 'logits')) - def tf_log_probability(self, distr_params, action): - logits, _, _ = distr_params - one_hot = tf.one_hot(indices=action, depth=self.num_actions) - return tf.reduce_sum(input_tensor=(logits * one_hot), axis=-1) - - def tf_entropy(self, distr_params): - logits, probabilities, _ = distr_params return -tf.reduce_sum(input_tensor=(probabilities * logits), axis=-1) - def tf_kl_divergence(self, distr_params1, distr_params2): - logits1, probabilities1, _ = distr_params1 - logits2, _, _ = distr_params2 - log_prob_ratio = logits1 - logits2 - return tf.reduce_sum(input_tensor=(probabilities1 * log_prob_ratio), axis=-1) + @tf_function(num_args=2) + def kl_divergence(self, *, parameters1, parameters2): + probabilities1, logits1 = parameters1.get(('probabilities', 'logits')) + logits2 = parameters2['logits'] - def 
tf_regularization_loss(self): - regularization_loss = super(Categorical, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - regularization_loss = self.logits.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) + log_prob_ratio = logits1 - logits2 - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None + return tf.reduce_sum(input_tensor=(probabilities1 * log_prob_ratio), axis=-1) - def get_variables(self, include_nontrainable=False): - distribution_variables = super(Categorical, self).get_variables(include_nontrainable=include_nontrainable) - logits_variables = self.logits.get_variables(include_nontrainable=include_nontrainable) + @tf_function(num_args=2) + def action_value(self, *, parameters, action): + action_values = parameters['action_values'] - return distribution_variables + logits_variables + rank = self.action_spec.rank + 1 + action = tf.expand_dims(input=action, axis=rank) + action_value = tf.gather(params=action_values, indices=action, batch_dims=rank) - def get_summaries(self): - distribution_summaries = super(Categorical, self).get_summaries() - logits_summaries = self.logits.get_summaries() + return tf.squeeze(input=action_value, axis=rank) - return distribution_summaries + logits_summaries + @tf_function(num_args=1) + def state_value(self, *, parameters): + return parameters['state_value'] diff --git a/tensorforce/core/distributions/distribution.py b/tensorforce/core/distributions/distribution.py index 7eb78399a..f0b82fa7a 100755 --- a/tensorforce/core/distributions/distribution.py +++ b/tensorforce/core/distributions/distribution.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,184 +13,147 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division +from tensorforce.core import Module, SignatureDict, TensorSpec, tf_function -import tensorflow as tf -from tensorforce import util -import tensorforce.core.distributions - - -class Distribution(object): +class Distribution(Module): """ Base class for policy distributions. + + Args: + name (string): internal use. + action_spec (specification): internal use. + input_spec (specification): internal use. + parameters_spec (specification): internal use. + conditions_spec (specification): internal use. """ - def __init__(self, shape, scope='distribution', summary_labels=None): - """ - Distribution. - - Args: - shape: Action shape. 
- """ - self.shape = shape - - self.summary_labels = set(summary_labels or ()) - - self.variables = dict() - self.all_variables = dict() - self.summaries = list() - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - self.all_variables[name] = variable - if kwargs.get('trainable', True): - self.variables[name] = variable - if 'variables' in self.summary_labels: - summary = tf.summary.histogram(name=name, values=variable) - self.summaries.append(summary) - return variable - - self.parameterize = tf.make_template( - name_=(scope + '/parameterize'), - func_=self.tf_parameterize, - custom_getter_=custom_getter - ) - self.sample = tf.make_template( - name_=(scope + '/sample'), - func_=self.tf_sample, - custom_getter_=custom_getter - ) - self.log_probability = tf.make_template( - name_=(scope + '/log-probability'), - func_=self.tf_log_probability, - custom_getter_=custom_getter - ) - self.entropy = tf.make_template( - name_=(scope + '/entropy'), - func_=self.tf_entropy, - custom_getter_=custom_getter - ) - self.kl_divergence = tf.make_template( - name_=(scope + '/kl-divergence'), - func_=self.tf_kl_divergence, - custom_getter_=custom_getter - ) - self.regularization_loss = tf.make_template( - name_=(scope + '/regularization-loss'), - func_=self.tf_regularization_loss, - custom_getter_=custom_getter - ) - - def tf_parameterize(self, x): - """ - Creates the TensorFlow operations for parameterizing a distribution conditioned on the - given input. - - Args: - x: Input tensor which the distribution is conditioned on. - - Returns: - Tuple of distribution parameter tensors. - """ + def __init__( + self, *, name=None, action_spec=None, input_spec=None, parameters_spec=None, + conditions_spec=None + ): + assert input_spec.type == 'float' + super().__init__(l2_regularization=0.0, name=name) + + self.action_spec = action_spec + self.input_spec = input_spec + self.parameters_spec = parameters_spec + self.conditions_spec = conditions_spec + + def get_architecture(self): raise NotImplementedError - def tf_sample(self, distr_params, deterministic): - """ - Creates the TensorFlow operations for sampling an action based on a distribution. 
+ def input_signature(self, *, function): + if function == 'action_value': + return SignatureDict( + parameters=self.parameters_spec.signature(batched=True), + action=self.action_spec.signature(batched=True) + ) + + elif function == 'entropy': + return SignatureDict(parameters=self.parameters_spec.signature(batched=True)) + + elif function == 'kl_divergence': + return SignatureDict( + parameters1=self.parameters_spec.signature(batched=True), + parameters2=self.parameters_spec.signature(batched=True) + ) + + elif function == 'log_probability': + return SignatureDict( + parameters=self.parameters_spec.signature(batched=True), + action=self.action_spec.signature(batched=True) + ) + + elif function == 'mode': + return SignatureDict(parameters=self.parameters_spec.signature(batched=True)) + + elif function == 'parametrize': + return SignatureDict( + x=self.input_spec.signature(batched=True), + conditions=self.conditions_spec.signature(batched=True) + ) + + elif function == 'sample': + return SignatureDict( + parameters=self.parameters_spec.signature(batched=True), + temperature=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + elif function == 'state_value': + return SignatureDict(parameters=self.parameters_spec.signature(batched=True)) - Args: - distr_params: Tuple of distribution parameter tensors. - deterministic: Boolean input tensor indicating whether the maximum likelihood action - should be returned. + else: + return super().input_signature(function=function) - Returns: - Sampled action tensor. - """ - raise NotImplementedError + def output_signature(self, *, function): + shape = self.action_spec.shape - def tf_log_probability(self, distr_params, action): - """ - Creates the TensorFlow operations for calculating the log probability of an action for a - distribution. + if function == 'action_value': + return SignatureDict( + singleton=TensorSpec(type='float', shape=shape).signature(batched=True) + ) - Args: - distr_params: Tuple of distribution parameter tensors. - action: Action tensor. + elif function == 'entropy': + return SignatureDict( + singleton=TensorSpec(type='float', shape=shape).signature(batched=True) + ) - Returns: - KL divergence tensor. - """ - raise NotImplementedError + elif function == 'kl_divergence': + return SignatureDict( + singleton=TensorSpec(type='float', shape=shape).signature(batched=True) + ) + + elif function == 'log_probability': + return SignatureDict( + singleton=TensorSpec(type='float', shape=shape).signature(batched=True) + ) + + elif function == 'mode': + return SignatureDict(singleton=self.action_spec.signature(batched=True)) - def tf_entropy(self, distr_params): - """ - Creates the TensorFlow operations for calculating the entropy of a distribution. + elif function == 'parametrize': + return SignatureDict(singleton=self.parameters_spec.signature(batched=True)) - Args: - distr_params: Tuple of distribution parameter tensors. + elif function == 'sample': + return SignatureDict(singleton=self.action_spec.signature(batched=True)) - Returns: - Entropy tensor. - """ + elif function == 'state_value': + return SignatureDict( + singleton=TensorSpec(type='float', shape=shape).signature(batched=True) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=2) + def parametrize(self, *, x, conditions): raise NotImplementedError - def tf_kl_divergence(self, distr_params1, distr_params2): - """ - Creates the TensorFlow operations for calculating the KL divergence between two - distributions. 
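Beta.kl_divergence earlier in this diff evaluates the closed-form KL divergence between two Beta densities from their log-normalizers and digamma terms. Below is a small sanity check of that formula outside the graph, using SciPy only for digamma/gammaln; the helper names and example parameter values are illustrative, not part of the patch.

```python
# Illustrative sketch only, not part of the patch; SciPy provides digamma/gammaln.
from scipy.special import digamma, gammaln


def beta_log_norm(alpha, beta):
    # log_norm = lgamma(alpha) + lgamma(beta) - lgamma(alpha + beta)
    return gammaln(alpha) + gammaln(beta) - gammaln(alpha + beta)


def beta_kl(alpha1, beta1, alpha2, beta2):
    log_norm1 = beta_log_norm(alpha1, beta1)
    log_norm2 = beta_log_norm(alpha2, beta2)
    return log_norm2 - log_norm1 \
        - digamma(beta1) * (beta2 - beta1) \
        - digamma(alpha1) * (alpha2 - alpha1) \
        + digamma(alpha1 + beta1) * ((alpha2 + beta2) - (alpha1 + beta1))


print(beta_kl(2.0, 3.0, 2.0, 3.0))  # 0.0 for identical distributions
print(beta_kl(2.0, 3.0, 4.0, 1.5))  # positive otherwise
```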
+ @tf_function(num_args=1) + def mode(self, *, parameters, independent): + raise NotImplementedError - Args: - distr_params1: Tuple of parameter tensors for first distribution. - distr_params2: Tuple of parameter tensors for second distribution. + @tf_function(num_args=2) + def sample(self, *, parameters, temperature, independent): + raise NotImplementedError - Returns: - KL divergence tensor. - """ + @tf_function(num_args=2) + def log_probability(self, *, parameters, action): raise NotImplementedError - def tf_regularization_loss(self): - """ - Creates the TensorFlow operations for the distribution regularization loss. + @tf_function(num_args=1) + def entropy(self, *, parameters): + raise NotImplementedError - Returns: - Regularization loss tensor. - """ - return None + @tf_function(num_args=2) + def kl_divergence(self, *, parameters1, parameters2): + raise NotImplementedError - def get_variables(self, include_nontrainable=False): - """ - Returns the TensorFlow variables used by the distribution. + @tf_function(num_args=2) + def action_value(self, *, parameters, action): + raise NotImplementedError - Returns: - List of variables. - """ - if include_nontrainable: - return [self.all_variables[key] for key in sorted(self.all_variables)] - else: - return [self.variables[key] for key in sorted(self.variables)] - - def get_summaries(self): - """ - Returns the TensorFlow summaries reported by the distribution. - - Returns: - List of summaries. - """ - return self.summaries - - @staticmethod - def from_spec(spec, kwargs=None): - """ - Creates a distribution from a specification dict. - """ - distribution = util.get_object( - obj=spec, - predefined_objects=tensorforce.core.distributions.distributions, - kwargs=kwargs - ) - assert isinstance(distribution, Distribution) - return distribution + @tf_function(num_args=1) + def state_value(self, *, parameters): + raise NotImplementedError diff --git a/tensorforce/core/distributions/gaussian.py b/tensorforce/core/distributions/gaussian.py index 2b9439bd4..6ebac538f 100755 --- a/tensorforce/core/distributions/gaussian.py +++ b/tensorforce/core/distributions/gaussian.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,132 +13,402 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from math import e, log, pi +import numpy as np import tensorflow as tf -from tensorforce import util -from tensorforce.core.networks import Linear +from tensorforce import TensorforceError, util +from tensorforce.core import layer_modules, TensorDict, TensorSpec, TensorsSpec, tf_function, \ + tf_util from tensorforce.core.distributions import Distribution class Gaussian(Distribution): """ - Gaussian distribution, for unbounded continuous actions. + Gaussian distribution, for continuous actions (specification key: `gaussian`). + + Args: + stddev_mode ("predicted" | "global"): Whether to predict the standard deviation via a linear + transformation of the state embedding, or to parametrize the standard deviation by a + separate set of trainable weights + (default: "predicted"). 
+ bounded_transform ("clipping" | "tanh"): Transformation to adjust sampled actions in case of + bounded action space, "tanh" transforms distribution (e.g. log probability computation) + accordingly whereas "clipping" does not + (default: tanh). + name (string): internal use. + action_spec (specification): internal use. + input_spec (specification): internal use. """ - def __init__(self, shape, mean=0.0, log_stddev=0.0, scope='gaussian', summary_labels=()): - """ - Categorical distribution. - - Args: - shape: Action shape. - mean: Optional distribution bias for the mean. - log_stddev: Optional distribution bias for the standard deviation. - """ - self.shape = shape - action_size = util.prod(self.shape) - - self.mean = Linear(size=action_size, bias=mean, scope='mean') - self.log_stddev = Linear(size=action_size, bias=log_stddev, scope='log-stddev') + def __init__( + self, *, stddev_mode='predicted', bounded_transform='tanh', name=None, action_spec=None, + input_spec=None + ): + assert action_spec.type == 'float' + + parameters_spec = TensorsSpec( + mean=TensorSpec(type='float', shape=action_spec.shape), + stddev=TensorSpec(type='float', shape=action_spec.shape), + log_stddev=TensorSpec(type='float', shape=action_spec.shape) + ) + conditions_spec = TensorsSpec() + + super().__init__( + name=name, action_spec=action_spec, input_spec=input_spec, + parameters_spec=parameters_spec, conditions_spec=conditions_spec + ) + + self.stddev_mode = stddev_mode + + if bounded_transform is None: + bounded_transform = 'tanh' + if bounded_transform not in ('clipping', 'tanh'): + raise TensorforceError.value( + name='Gaussian', argument='bounded_transform', value=bounded_transform, + hint='not in {clipping,tanh}' + ) + elif bounded_transform == 'tanh' and ( + (self.action_spec.min_value is not None) is not (self.action_spec.max_value is not None) + ): + raise TensorforceError.value( + name='Gaussian', argument='bounded_transform', value=bounded_transform, + condition='one-sided bounded action space' + ) + elif self.action_spec.min_value is None and self.action_spec.max_value is None: + bounded_transform = None + self.bounded_transform = bounded_transform + + if self.input_spec.rank == 1: + # Single embedding + self.mean = self.submodule( + name='mean', module='linear', modules=layer_modules, size=self.action_spec.size, + initialization_scale=0.01, input_spec=self.input_spec + ) + if self.stddev_mode == 'predicted': + self.stddev = self.submodule( + name='stddev', module='linear', modules=layer_modules, + size=self.action_spec.size, initialization_scale=0.01, + input_spec=self.input_spec + ) - super(Gaussian, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_parameterize(self, x): - # Flat mean and log standard deviation + else: + # Embedding per action + if self.input_spec.rank < 1 or self.input_spec.rank > 3: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.embedding_shape, + hint='invalid rank' + ) + elif self.input_spec.shape[:-1] == self.action_spec.shape[:-1]: + size = self.action_spec.shape[-1] + elif self.input_spec.shape[:-1] == self.action_spec.shape: + size = 0 + else: + raise TensorforceError.value( + name=name, argument='input_spec.shape', value=self.input_spec.shape, + hint='not flattened and incompatible with action shape' + ) + self.mean = self.submodule( + name='mean', module='linear', modules=layer_modules, size=size, + initialization_scale=0.01, input_spec=self.input_spec + ) + if self.stddev_mode == 'predicted': + 
self.stddev = self.submodule( + name='stddev', module='linear', modules=layer_modules, size=size, + initialization_scale=0.01, input_spec=self.input_spec + ) + + def get_architecture(self): + architecture = 'Mean: {}'.format(self.mean.get_architecture()) + if self.stddev_mode == 'predicted': + architecture += '\nStddev: {}'.format(self.stddev.get_architecture()) + return architecture + + def initialize(self): + super().initialize() + + if self.stddev_mode == 'global': + spec = TensorSpec(type='float', shape=((1,) + self.action_spec.shape)) + self.stddev = self.variable( + name='stddev', spec=spec, initializer='zeros', is_trainable=True, is_saved=True + ) + + prefix = 'distributions/' + self.name + names = (prefix + '-mean', prefix + '-stddev') + self.register_summary(label='distribution', name=names) + + spec = self.parameters_spec['mean'] + self.register_tracking(label='distribution', name='mean', spec=spec) + self.register_tracking(label='distribution', name='stddev', spec=spec) + + @tf_function(num_args=2) + def parametrize(self, *, x, conditions): + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float') + + # Mean mean = self.mean.apply(x=x) - log_stddev = self.log_stddev.apply(x=x) - - # Reshape mean and log stddev to action shape - shape = (-1,) + self.shape - mean = tf.reshape(tensor=mean, shape=shape) - log_stddev = tf.reshape(tensor=log_stddev, shape=shape) - - # Clip log stddev for numerical stability - log_eps = log(util.epsilon) # epsilon < 1.0, hence negative - log_stddev = tf.clip_by_value(t=log_stddev, clip_value_min=log_eps, clip_value_max=-log_eps) - - # Standard deviation - stddev = tf.exp(x=log_stddev) + if self.input_spec.rank == 1: + shape = (-1,) + self.action_spec.shape + mean = tf.reshape(tensor=mean, shape=shape) + + # Softplus standard deviation + if self.stddev_mode == 'global': + multiples = (tf.shape(input=x)[0],) + tuple(1 for _ in range(self.action_spec.rank)) + softplus_stddev = tf.tile(input=self.stddev, multiples=multiples) + else: + softplus_stddev = self.stddev.apply(x=x) + if self.input_spec.rank == 1: + softplus_stddev = tf.reshape(tensor=softplus_stddev, shape=shape) + + # # Shift softplus_stddev to reduce zero value to 0.25 (TODO: 0.25 random choice) + # if self.action_spec.min_value is not None and self.action_spec.max_value is not None: + # softplus_stddev += tf_util.constant(value=np.log(0.25), dtype='float') + + # Clip softplus_stddev for numerical stability (epsilon < 1.0, hence negative) + softplus_stddev = tf.clip_by_value( + t=softplus_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon + ) + + # Softplus transformation (based on https://arxiv.org/abs/2007.06059) + softplus_shift = tf_util.constant(value=0.2, dtype='float') + log_two = tf_util.constant(value=np.log(2.0), dtype='float') + stddev = (tf.nn.softplus(features=softplus_stddev) + softplus_shift) / \ + (log_two + softplus_shift) + + # Divide stddev to reduce zero value to 0.25 (TODO: 0.25 random choice) + if self.action_spec.min_value is not None and self.action_spec.max_value is not None: + stddev *= tf_util.constant(value=0.25, dtype='float') + + # Log stddev + log_stddev = tf.math.log(x=(stddev + epsilon)) + + return TensorDict(mean=mean, stddev=stddev, log_stddev=log_stddev) + + @tf_function(num_args=1) + def mode(self, *, parameters, independent): + mean, stddev = parameters.get(('mean', 'stddev')) + + # Distribution parameter summaries and tracking + dependencies = list() + if not 
independent: + def fn_summary(): + m = tf.math.reduce_mean(input_tensor=mean, axis=range(self.action_spec.rank + 1)) + s = tf.math.reduce_mean(input_tensor=stddev, axis=range(self.action_spec.rank + 1)) + return m, s + + prefix = 'distributions/' + self.name + names = (prefix + '-mean', prefix + '-stddev') + dependencies.extend(self.summary( + label='distribution', name=names, data=fn_summary, step='timesteps' + )) + + # Distribution parameter tracking + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=mean, axis=0) + + dependencies.extend(self.track(label='distribution', name='mean', data=fn_tracking)) + + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=stddev, axis=0) + + dependencies.extend(self.track(label='distribution', name='stddev', data=fn_tracking)) + + with tf.control_dependencies(control_inputs=dependencies): + action = mean + + # Bounded transformation + if self.bounded_transform is not None: + one = tf_util.constant(value=1.0, dtype='float') + + if self.bounded_transform == 'tanh': + action = tf.math.tanh(x=action) + elif self.bounded_transform == 'clipping': + action = tf.clip_by_value(t=action, clip_value_min=-one, clip_value_max=one) + + if self.action_spec.min_value is not None and \ + self.action_spec.max_value is not None: + half = tf_util.constant(value=0.5, dtype='float') + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') + action = min_value + (max_value - min_value) * half * (action + one) + + elif self.action_spec.min_value is not None: + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + action = tf.maximum(x=min_value, y=action) + else: + assert self.action_spec.max_value is not None + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') + action = tf.minimum(x=max_value, y=action) + + return action + + @tf_function(num_args=2) + def sample(self, *, parameters, temperature, independent): + mean, stddev = parameters.get(('mean', 'stddev')) + + # Distribution parameter summaries and tracking + dependencies = list() + if not independent: + def fn_summary(): + m = tf.math.reduce_mean(input_tensor=mean, axis=range(self.action_spec.rank + 1)) + s = tf.math.reduce_mean(input_tensor=stddev, axis=range(self.action_spec.rank + 1)) + return m, s + + prefix = 'distributions/' + self.name + names = (prefix + '-mean', prefix + '-stddev') + dependencies.extend(self.summary( + label='distribution', name=names, data=fn_summary, step='timesteps' + )) + + # Distribution parameter tracking + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=mean, axis=0) + + dependencies.extend(self.track(label='distribution', name='mean', data=fn_tracking)) + + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=stddev, axis=0) + + dependencies.extend(self.track(label='distribution', name='stddev', data=fn_tracking)) + + def fn_mode(): + return mean + + def fn_sample(): + normal_distribution = tf.random.normal( + shape=tf.shape(input=mean), dtype=tf_util.get_dtype(type='float') + ) + return mean + stddev * temperature * normal_distribution + + with tf.control_dependencies(control_inputs=dependencies): + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + action = tf.cond(pred=(temperature < epsilon), true_fn=fn_mode, false_fn=fn_sample) + + # Bounded transformation + if self.bounded_transform is not None: + one = tf_util.constant(value=1.0, dtype='float') + + if 
self.bounded_transform == 'tanh': + action = tf.math.tanh(x=action) + elif self.bounded_transform == 'clipping': + action = tf.clip_by_value(t=action, clip_value_min=-one, clip_value_max=one) + + if self.action_spec.min_value is not None and \ + self.action_spec.max_value is not None: + half = tf_util.constant(value=0.5, dtype='float') + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') + action = min_value + (max_value - min_value) * half * (action + one) + + elif self.action_spec.min_value is not None: + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + action = tf.maximum(x=min_value, y=action) + else: + assert self.action_spec.max_value is not None + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') + action = tf.minimum(x=max_value, y=action) + + return action + + @tf_function(num_args=2) + def log_probability(self, *, parameters, action): + mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev')) + + # Inverse bounded transformation + if self.bounded_transform is not None: + if self.action_spec.min_value is not None and self.action_spec.max_value is not None: + one = tf_util.constant(value=1.0, dtype='float') + two = tf_util.constant(value=2.0, dtype='float') + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') + action = two * (action - min_value) / (max_value - min_value) - one + + if self.bounded_transform == 'tanh': + clip = tf_util.constant(value=(1.0 - util.epsilon), dtype='float') + action = tf.clip_by_value(t=action, clip_value_min=-clip, clip_value_max=clip) + action = tf_util.cast(x=tf.math.atanh(x=tf_util.float32(x=action)), dtype='float') + + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + half = tf_util.constant(value=0.5, dtype='float') + half_log_two_pi = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi)), dtype='float') - return mean, stddev, log_stddev + sq_mean_distance = tf.square(x=(action - mean)) + sq_stddev = tf.square(x=stddev) + epsilon - def state_value(self, distr_params): - _, _, log_stddev = distr_params - return -log_stddev - 0.5 * log(2.0 * pi) + log_prob = -half * sq_mean_distance / sq_stddev - log_stddev - half_log_two_pi - def state_action_value(self, distr_params, action): - mean, stddev, log_stddev = distr_params - sq_mean_distance = tf.square(x=(action - mean)) - sq_stddev = tf.maximum(x=tf.square(x=stddev), y=util.epsilon) - return -0.5 * sq_mean_distance / sq_stddev - 2.0 * log_stddev - log(2.0 * pi) + if self.bounded_transform == 'tanh': + log_two = tf_util.constant(value=np.log(2.0), dtype='float') + log_prob -= two * (log_two - action - tf.math.softplus(features=(-two * action))) - def tf_sample(self, distr_params, deterministic): - mean, stddev, _ = distr_params + return log_prob - # Deterministic: mean as action - definite = mean + @tf_function(num_args=1) + def entropy(self, *, parameters): + log_stddev = parameters['log_stddev'] - # Non-deterministic: sample action using default normal distribution - normal_distribution = tf.random_normal(shape=tf.shape(input=mean)) - sampled = mean + stddev * normal_distribution + half_lg_two_pi_e = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi * np.e)), dtype='float') - return tf.where(condition=deterministic, x=definite, y=sampled) + # TODO: doesn't take into account self.bounded_transform == 
'tanh' - def tf_log_probability(self, distr_params, action): - mean, stddev, log_stddev = distr_params - sq_mean_distance = tf.square(x=(action - mean)) - sq_stddev = tf.maximum(x=tf.square(x=stddev), y=util.epsilon) - return -0.5 * sq_mean_distance / sq_stddev - log_stddev - 0.5 * log(2.0 * pi) + return log_stddev + half_lg_two_pi_e - def tf_entropy(self, distr_params): - _, _, log_stddev = distr_params - return log_stddev + 0.5 * log(2.0 * pi * e) + @tf_function(num_args=2) + def kl_divergence(self, *, parameters1, parameters2): + mean1, stddev1, log_stddev1 = parameters1.get(('mean', 'stddev', 'log_stddev')) + mean2, stddev2, log_stddev2 = parameters2.get(('mean', 'stddev', 'log_stddev')) - def tf_kl_divergence(self, distr_params1, distr_params2): - mean1, stddev1, log_stddev1 = distr_params1 - mean2, stddev2, log_stddev2 = distr_params2 + half = tf_util.constant(value=0.5, dtype='float') + epsilon = tf_util.constant(value=util.epsilon, dtype='float') log_stddev_ratio = log_stddev2 - log_stddev1 sq_mean_distance = tf.square(x=(mean1 - mean2)) sq_stddev1 = tf.square(x=stddev1) - sq_stddev2 = tf.maximum(x=tf.square(x=stddev2), y=util.epsilon) - - return log_stddev_ratio + 0.5 * (sq_stddev1 + sq_mean_distance) / sq_stddev2 - 0.5 + sq_stddev2 = tf.square(x=stddev2) + epsilon + + return log_stddev_ratio + half * (sq_stddev1 + sq_mean_distance) / sq_stddev2 - half + + @tf_function(num_args=2) + def action_value(self, *, parameters, action): + mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev')) + + # Inverse bounded transformation + if self.bounded_transform is not None: + if self.action_spec.min_value is not None and self.action_spec.max_value is not None: + one = tf_util.constant(value=1.0, dtype='float') + two = tf_util.constant(value=2.0, dtype='float') + min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float') + max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float') + action = two * (action - min_value) / (max_value - min_value) - one + + if self.bounded_transform == 'tanh': + clip = tf_util.constant(value=(1.0 - util.epsilon), dtype='float') + action = tf.clip_by_value(t=action, clip_value_min=-clip, clip_value_max=clip) + action = tf.math.atanh(x=action) + + half = tf_util.constant(value=0.5, dtype='float') + two = tf_util.constant(value=2.0, dtype='float') + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + log_two_pi = tf_util.constant(value=(np.log(2.0 * np.pi)), dtype='float') + # TODO: why no e here, but for entropy? - def tf_regularization_loss(self): - regularization_loss = super(Gaussian, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - regularization_loss = self.mean.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) + sq_mean_distance = tf.square(x=(action - mean)) + sq_stddev = tf.square(x=stddev) + epsilon - regularization_loss = self.log_stddev.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) + action_value = -half * sq_mean_distance / sq_stddev - two * log_stddev - log_two_pi - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None + # Probably not needed? 
+ # if self.bounded_transform == 'tanh': + # log_two = tf_util.constant(value=np.log(2.0), dtype='float') + # action_value -= two * (log_two - action - tf.math.softplus(features=(-two * action))) - def get_variables(self, include_nontrainable=False): - distribution_variables = super(Gaussian, self).get_variables(include_nontrainable=include_nontrainable) - mean_variables = self.mean.get_variables(include_nontrainable=include_nontrainable) - log_stddev_variables = self.log_stddev.get_variables(include_nontrainable=include_nontrainable) + return action_value - return distribution_variables + mean_variables + log_stddev_variables + @tf_function(num_args=1) + def state_value(self, *, parameters): + log_stddev = parameters['log_stddev'] - def get_summaries(self): - distribution_summaries = super(Gaussian, self).get_summaries() - mean_summaries = self.mean.get_summaries() - log_stddev_summaries = self.log_stddev.get_summaries() + half_lg_two_pi = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi)), dtype='float') + # TODO: why no e here, but for entropy? - return distribution_summaries + mean_summaries + log_stddev_summaries + return -log_stddev - half_lg_two_pi diff --git a/tensorforce/core/explorations/__init__.py b/tensorforce/core/explorations/__init__.py deleted file mode 100755 index 89c76cef3..000000000 --- a/tensorforce/core/explorations/__init__.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - - -from tensorforce.core.explorations.exploration import Exploration -from tensorforce.core.explorations.constant import Constant -from tensorforce.core.explorations.epsilon_anneal import EpsilonAnneal -from tensorforce.core.explorations.epsilon_decay import EpsilonDecay -from tensorforce.core.explorations.gaussian_noise import GaussianNoise -from tensorforce.core.explorations.ornstein_uhlenbeck_process import OrnsteinUhlenbeckProcess - - -explorations = dict( - constant=Constant, - epsilon_anneal=EpsilonAnneal, - epsilon_decay=EpsilonDecay, - gaussian_noise=GaussianNoise, - ornstein_uhlenbeck=OrnsteinUhlenbeckProcess -) - - -__all__ = [ - 'Exploration', - 'Constant', - 'EpsilonAnneal', - 'EpsilonDecay', - 'GaussianNoise', - 'OrnsteinUhlenbeckProcess', - 'explorations' -] diff --git a/tensorforce/core/explorations/epsilon_anneal.py b/tensorforce/core/explorations/epsilon_anneal.py deleted file mode 100644 index 9975c8f30..000000000 --- a/tensorforce/core/explorations/epsilon_anneal.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
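A note on the Gaussian changes above: the term subtracted from the log-probability under the 'tanh' transform is the log-derivative of tanh, i.e. log(1 - tanh(a)^2), written in a softplus form for numerical stability, and the affine map rescales the squashed value from (-1, 1) into [min_value, max_value]. A minimal standalone NumPy sketch (hypothetical values, no Tensorforce imports) checks both:

```python
# Standalone NumPy sketch of the bounded 'tanh' transformation used above;
# values and variable names are illustrative only.
import numpy as np

def softplus(x):
    # log(1 + exp(x)), numerically stable for large |x|
    return np.logaddexp(0.0, x)

a = np.linspace(-3.0, 3.0, 7)        # pre-squash Gaussian sample
min_value, max_value = -2.0, 5.0     # hypothetical action bounds

# Affine rescaling of tanh(a) from (-1, 1) into (min_value, max_value),
# matching: min_value + (max_value - min_value) * 0.5 * (action + 1)
action = min_value + (max_value - min_value) * 0.5 * (np.tanh(a) + 1.0)
assert np.all(action > min_value) and np.all(action < max_value)

# The correction subtracted in log_probability equals log d tanh(a)/da
# = log(1 - tanh(a)^2), in the stable form used in the code above.
correction = 2.0 * (np.log(2.0) - a - softplus(-2.0 * a))
assert np.allclose(correction, np.log(1.0 - np.tanh(a) ** 2))
```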
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.explorations import Exploration - - -class EpsilonAnneal(Exploration): - """ - Annealing epsilon parameter based on ratio of current timestep to total timesteps. - """ - - def __init__( - self, - initial_epsilon=1.0, - final_epsilon=0.1, - timesteps=10000, - start_timestep=0, - scope='epsilon_anneal', - summary_labels=() - ): - self.initial_epsilon = initial_epsilon - self.final_epsilon = final_epsilon - self.timesteps = timesteps - self.start_timestep = start_timestep - - super(EpsilonAnneal, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_explore(self, episode, timestep, action_spec=None): - - def true_fn(): - # Know if first is not true second must be true from outer cond check. - return tf.cond( - pred=(timestep < self.start_timestep), - true_fn=(lambda: self.initial_epsilon), - false_fn=(lambda: self.final_epsilon) - ) - - def false_fn(): - completed_ratio = (tf.cast(x=timestep, dtype=util.tf_dtype('float')) - self.start_timestep) / self.timesteps - return self.initial_epsilon + completed_ratio * (self.final_epsilon - self.initial_epsilon) - - pred = tf.logical_or(x=(timestep < self.start_timestep), y=(timestep > self.start_timestep + self.timesteps)) - return tf.cond(pred=pred, true_fn=true_fn, false_fn=false_fn) diff --git a/tensorforce/core/explorations/epsilon_decay.py b/tensorforce/core/explorations/epsilon_decay.py deleted file mode 100755 index 533ba85b8..000000000 --- a/tensorforce/core/explorations/epsilon_decay.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.explorations import Exploration - - -class EpsilonDecay(Exploration): - """ - Exponentially decaying epsilon parameter based on ratio of - difference between current and final epsilon to total timesteps. 
- """ - - def __init__( - self, - initial_epsilon=1.0, - final_epsilon=0.1, - timesteps=10000, - start_timestep=0, - half_lives=10, - scope='epsilon_anneal', - summary_labels=() - ): - self.initial_epsilon = initial_epsilon - self.final_epsilon = final_epsilon - self.timesteps = timesteps - self.start_timestep = start_timestep - self.half_life = timesteps / half_lives - - super(EpsilonDecay, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_explore(self, episode=0, timestep=0, action_spec=None): - - def true_fn(): - # Know if first is not true second must be true from outer cond check. - return tf.cond( - pred=(timestep < self.start_timestep), - true_fn=(lambda: self.initial_epsilon), - false_fn=(lambda: self.final_epsilon) - ) - - def false_fn(): - half_life_ratio = (tf.cast(x=timestep, dtype=util.tf_dtype('float')) - self.start_timestep) / self.half_life - epsilon = self.final_epsilon + (2 ** (-half_life_ratio)) * (self.initial_epsilon - self.final_epsilon) - return epsilon - - pred = tf.logical_or(x=(timestep < self.start_timestep), - y=(timestep > self.start_timestep + int(self.timesteps))) - return tf.cond(pred=pred, true_fn=true_fn, false_fn=false_fn) diff --git a/tensorforce/core/explorations/exploration.py b/tensorforce/core/explorations/exploration.py deleted file mode 100755 index ecba24530..000000000 --- a/tensorforce/core/explorations/exploration.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import tensorflow as tf -from tensorforce import util -import tensorforce.core.explorations - - -class Exploration(object): - """ - Abstract exploration object. - """ - - def __init__(self, scope='exploration', summary_labels=None): - self.summary_labels = set(summary_labels or ()) - - self.variables = dict() - self.summaries = list() - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - self.variables[name] = variable - return variable - - self.explore = tf.make_template( - name_=(scope + '/explore'), - func_=self.tf_explore, - custom_getter_=custom_getter - ) - - def tf_explore(self, episode, timestep, action_spec): - """ - Creates exploration value, e.g. compute an epsilon for epsilon-greedy or sample normal - noise. - """ - raise NotImplementedError - - def get_variables(self): - """ - Returns exploration variables. - - Returns: - List of variables. - """ - return [self.variables[key] for key in sorted(self.variables)] - - @staticmethod - def from_spec(spec): - """ - Creates an exploration object from a specification dict. 
- """ - exploration = util.get_object( - obj=spec, - predefined_objects=tensorforce.core.explorations.explorations - ) - assert isinstance(exploration, Exploration) - return exploration diff --git a/tensorforce/core/explorations/gaussian_noise.py b/tensorforce/core/explorations/gaussian_noise.py deleted file mode 100755 index 3e01492cc..000000000 --- a/tensorforce/core/explorations/gaussian_noise.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import tensorflow as tf - -from tensorforce.core.explorations import Exploration - - -class GaussianNoise(Exploration): - """ - Explores via gaussian noise. - """ - - def __init__( - self, - sigma=0.3, - mu=0.0, - scope='gaussian_noise', - summary_labels=() - ): - """ - Initializes distribution values for gaussian noise - """ - self.sigma = sigma - self.mu = float(mu) # need to add cast to float to avoid tf type-mismatch error in case mu=0.0 - - super(GaussianNoise, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_explore(self, episode, timestep, action_spec): - return tf.random_normal(shape=action_spec['shape'], mean=self.mu, stddev=self.sigma) diff --git a/tensorforce/core/explorations/ornstein_uhlenbeck_process.py b/tensorforce/core/explorations/ornstein_uhlenbeck_process.py deleted file mode 100755 index 6bc787a9f..000000000 --- a/tensorforce/core/explorations/ornstein_uhlenbeck_process.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.explorations import Exploration - - -class OrnsteinUhlenbeckProcess(Exploration): - """ - Explores via an Ornstein-Uhlenbeck process. - """ - - def __init__( - self, - sigma=0.3, - mu=0.0, - theta=0.15, - scope='ornstein_uhlenbeck', - summary_labels=() - ): - """ - Initializes an Ornstein-Uhlenbeck process which is a mean reverting stochastic process - introducing time-correlated noise. 
- """ - self.sigma = sigma - self.mu = float(mu) # need to add cast to float to avoid tf type-mismatch error in case mu=0.0 - self.theta = theta - - super(OrnsteinUhlenbeckProcess, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_explore(self, episode, timestep, action_spec): - normal_sample = tf.random_normal(shape=action_spec['shape'], mean=0.0, stddev=1.0) - state = tf.get_variable( - name='ornstein_uhlenbeck', - dtype=util.tf_dtype('float'), - shape=action_spec['shape'], - initializer=tf.constant_initializer(self.mu) - ) - return tf.assign_add(ref=state, value=(self.theta * (self.mu - state) + self.sigma * normal_sample)) diff --git a/tensorforce/core/layers/__init__.py b/tensorforce/core/layers/__init__.py new file mode 100644 index 000000000..2f3f27651 --- /dev/null +++ b/tensorforce/core/layers/__init__.py @@ -0,0 +1,76 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core.layers.layer import Block, Layer, MultiInputLayer, NondeterministicLayer, \ + Register, Retrieve, Reuse, StatefulLayer, TemporalLayer, TransformationBase + +# Require Layer +from tensorforce.core.layers.convolution import Conv1d, Conv2d, Conv1dTranspose, Conv2dTranspose +from tensorforce.core.layers.dense import Dense +from tensorforce.core.layers.embedding import Embedding +from tensorforce.core.layers.input_rnn import InputGru, InputLstm, InputRnn +from tensorforce.core.layers.keras import KerasLayer +from tensorforce.core.layers.misc import Activation, Dropout, Function, Reshape +from tensorforce.core.layers.normalization import BatchNormalization, ExponentialNormalization, \ + InstanceNormalization, LinearNormalization +from tensorforce.core.layers.pooling import Flatten, Pooling, Pool1d, Pool2d +from tensorforce.core.layers.preprocessing import Clipping, Deltafier, Image, PreprocessingLayer, \ + Sequence +from tensorforce.core.layers.rnn import Gru, Lstm, Rnn + +# Require Dense, Conv1d, Conv2d +from tensorforce.core.layers.linear import Linear + +# Require Linear +from tensorforce.core.layers.attention import SelfAttention + + +layer_modules = dict( + activation=Activation, + batch_normalization=BatchNormalization, block=Block, + clipping=Clipping, conv1d=Conv1d, conv2d=Conv2d, conv1d_transpose=Conv1dTranspose, + conv2d_transpose=Conv2dTranspose, + default=Function, deltafier=Deltafier, dense=Dense, dropout=Dropout, + embedding=Embedding, exponential_normalization=ExponentialNormalization, + flatten=Flatten, function=Function, + gru=Gru, + image=Image, input_gru=InputGru, input_lstm=InputLstm, input_rnn=InputRnn, + instance_normalization=InstanceNormalization, + keras=KerasLayer, + linear=Linear, linear_normalization=LinearNormalization, lstm=Lstm, + pooling=Pooling, pool1d=Pool1d, pool2d=Pool2d, + register=Register, reshape=Reshape, retrieve=Retrieve, reuse=Reuse, rnn=Rnn, + self_attention=SelfAttention, sequence=Sequence +) + + 
+__all__ = [ + 'Activation', + 'BatchNormalization', 'Block', + 'Clipping', 'Conv1d', 'Conv2d', 'Conv1dTranspose', 'Conv2dTranspose', + 'Deltafier', 'Dense', 'Dropout', + 'Embedding', 'ExponentialNormalization', + 'Flatten', 'Function', + 'GRU', + 'Image', 'InputGru', 'InputLstm', 'InputRnn', 'InstanceNormalization', + 'KerasLayer', + 'Layer', 'layer_modules', 'Linear', 'LinearNormalization', 'Lstm', + 'MultiInputLayer', + 'NondeterministicLayer', + 'Pooling', 'Pool1d', 'Pool2d', 'PreprocessingLayer', + 'Register', 'Reshape', 'Retrieve', 'Reuse', 'Rnn', + 'Sequence', 'StatefulLayer', + 'TemporalLayer', 'TransformationBase' +] diff --git a/tensorforce/core/layers/attention.py b/tensorforce/core/layers/attention.py new file mode 100644 index 000000000..64144f7d6 --- /dev/null +++ b/tensorforce/core/layers/attention.py @@ -0,0 +1,137 @@ +# Copyright 2021 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np +import tensorflow as tf + +from tensorforce import TensorforceError, util +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.layers import Linear, TransformationBase + + +class SelfAttention(TransformationBase): + """ + Self-attention layer (specification key: `self_attention`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + attention_size (int > 0): Query/key size + (default: same as output size). + bias (bool): Whether to add a trainable bias variable + (default: false). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: none). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + initialization_scale (float > 0.0): Initialization scale + (default: 1.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
+ """ + + def __init__( + self, *, size, attention_size=None, bias=False, activation=None, dropout=0.0, + initialization_scale=1.0, vars_trainable=True, l2_regularization=None, name=None, + input_spec=None + ): + super().__init__( + size=size, bias=bias, activation=activation, dropout=dropout, + vars_trainable=vars_trainable, l2_regularization=l2_regularization, name=name, + input_spec=input_spec + ) + + self.attention_size = size if attention_size is None else attention_size + + if input_spec.rank <= 1: + raise TensorforceError.value( + name='SelfAttention', argument='input_spec[shape]', value=str(input_spec.shape), + hint='is not rank >= 2' + ) + + self.query = self.submodule( + name='query', module=Linear, size=self.attention_size, bias=bias, + vars_trainable=vars_trainable, input_spec=input_spec + ) + self.key = self.submodule( + name='key', module=Linear, size=self.attention_size, bias=bias, + vars_trainable=vars_trainable, input_spec=input_spec + ) + self.value = self.submodule( + name='value', module=Linear, size=size, bias=bias, + initialization_scale=initialization_scale, vars_trainable=vars_trainable, + input_spec=input_spec + ) + + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['attention_size'] = str(self.attention_size) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if initialization_scale != 1.0: + self.architecture_kwargs['initialization_scale'] = str(initialization_scale) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + def output_spec(self): + return self.value.output_spec() + + @tf_function(num_args=1) + def apply(self, *, x): + queries = self.query.apply(x=x) + keys = self.key.apply(x=x) + values = self.value.apply(x=x) + + if self.input_spec.rank > 2: + batch_size = tf_util.cast(x=tf.shape(input=x)[:1], dtype='int') + + flattened_shape = tf_util.constant( + value=(util.product(xs=self.input_spec.shape[:-1]), self.attention_size), + dtype='int' + ) + flattened_shape = tf.concat(values=(batch_size, flattened_shape), axis=0) + queries = tf.reshape(tensor=queries, shape=flattened_shape) + keys = tf.reshape(tensor=keys, shape=flattened_shape) + + flattened_shape = tf_util.constant( + value=(util.product(xs=self.input_spec.shape[:-1]), self.size), dtype='int' + ) + flattened_shape = tf.concat(values=(batch_size, flattened_shape), axis=0) + values = tf.reshape(tensor=values, shape=flattened_shape) + + attention = tf.linalg.matmul(a=queries, b=keys, transpose_b=True) + attention = attention / tf_util.constant(value=np.sqrt(self.attention_size), dtype='float') + attention = tf.nn.softmax(logits=attention, axis=-1) + x = tf.linalg.matmul(a=attention, b=values) + + if self.input_spec.rank > 2: + shape = tf_util.constant(value=self.output_spec().shape, dtype='int') + shape = tf.concat(values=(batch_size, shape), axis=0) + x = tf.reshape(tensor=x, shape=shape) + + return super().apply(x=x) diff --git a/tensorforce/core/layers/convolution.py b/tensorforce/core/layers/convolution.py new file mode 100644 index 000000000..0550c7cea --- /dev/null +++ b/tensorforce/core/layers/convolution.py @@ -0,0 +1,580 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf +from tensorflow.python.keras.utils.conv_utils import conv_output_length, deconv_output_length + +from tensorforce import TensorforceError, util +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.layers import TransformationBase + + +class Conv1d(TransformationBase): + """ + 1-dimensional convolutional layer (specification key: `conv1d`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + window (int > 0): Window size + (default: 3). + stride (int > 0): Stride size + (default: 1). + padding ('same' | 'valid'): Padding type, see + `TensorFlow docs `__ + (default: 'same'). + dilation (int > 0 | (int > 0, int > 0)): Dilation value + (default: 1). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: relu). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + initialization_scale (float > 0.0): Initialization scale + (default: 1.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
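The output length computed by `Conv1d.output_spec` below relies on Keras' `conv_output_length`; for reference, a plain-Python sketch of that arithmetic for the two padding modes accepted here ('same' and 'valid'), under the usual dilation convention:

```python
# Sketch of the convolution output-length arithmetic (Keras conv_output_length
# convention) for the 'same'/'valid' padding modes used by Conv1d/Conv2d.
import math

def conv_length(input_length, window, padding, stride, dilation=1):
    effective_window = window + (window - 1) * (dilation - 1)
    if padding == 'same':
        length = input_length
    elif padding == 'valid':
        length = input_length - effective_window + 1
    else:
        raise ValueError(padding)
    return int(math.ceil(length / stride))

# e.g. a 'valid' convolution with window 3 over 84 inputs and stride 2:
assert conv_length(84, window=3, padding='valid', stride=2) == 41
```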
+ """ + + def __init__( + self, *, size, window=3, stride=1, padding='same', dilation=1, bias=True, activation='relu', + dropout=0.0, initialization_scale=1.0, vars_trainable=True, l2_regularization=None, + name=None, input_spec=None + ): + self.window = window + self.stride = stride + self.padding = padding + self.dilation = dilation + + super().__init__( + size=size, bias=bias, activation=activation, dropout=dropout, + vars_trainable=vars_trainable, l2_regularization=l2_regularization, name=name, + input_spec=input_spec + ) + + self.initialization_scale = initialization_scale + + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['window'] = str(window) + self.architecture_kwargs['padding'] = str(padding) + if stride != 1: + self.architecture_kwargs['stride'] = str(stride) + if dilation != 1: + self.architecture_kwargs['dilation'] = str(dilation) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if initialization_scale != 1.0: + self.architecture_kwargs['initialization_scale'] = str(initialization_scale) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0, 0)) + + def output_spec(self): + output_spec = super().output_spec() + + length = conv_output_length( + input_length=output_spec.shape[0], filter_size=self.window, padding=self.padding, + stride=self.stride, dilation=self.dilation + ) + + if self.squeeze: + output_spec.shape = (length,) + else: + output_spec.shape = (length, self.size) + + output_spec.min_value = None + output_spec.max_value = None + + return output_spec + + def initialize(self): + super().initialize() + + in_size = self.input_spec.shape[1] + + initializer = 'orthogonal' + if self.activation is not None and self.activation.nonlinearity == 'relu': + initializer += '-relu' + + self.weights = self.variable( + name='weights', spec=TensorSpec(type='float', shape=(self.window, in_size, self.size)), + initializer=initializer, initialization_scale=self.initialization_scale, + is_trainable=self.vars_trainable, is_saved=True + ) + + @tf_function(num_args=1) + def apply(self, *, x): + x = tf.nn.conv1d( + input=x, filters=self.weights, stride=self.stride, padding=self.padding.upper(), + dilations=self.dilation + ) + + return super().apply(x=x) + + +class Conv2d(TransformationBase): + """ + 2-dimensional convolutional layer (specification key: `conv2d`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + window (int > 0 | (int > 0, int > 0)): Window size + (default: 3). + stride (int > 0 | (int > 0, int > 0)): Stride size + (default: 1). + padding ('same' | 'valid'): Padding type, see + `TensorFlow docs `__ + (default: 'same'). + dilation (int > 0 | (int > 0, int > 0)): Dilation value + (default: 1). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: "relu"). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + initialization_scale (float > 0.0): Initialization scale + (default: 1.0). 
+ vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__( + self, *, size, window=3, stride=1, padding='same', dilation=1, bias=True, activation='relu', + dropout=0.0, initialization_scale=1.0, vars_trainable=True, l2_regularization=None, + name=None, input_spec=None + ): + if isinstance(window, int): + self.window = (window, window) + elif util.is_iterable(x=window) and len(window) == 2: + self.window = tuple(window) + else: + raise TensorforceError.type(name='Conv2d', argument='window', dtype=type(window)) + + if isinstance(stride, int): + self.stride = (1, stride, stride, 1) + elif util.is_iterable(x=stride) and len(stride) == 2: + self.stride = (1, stride[0], stride[1], 1) + else: + raise TensorforceError.type(name='Conv2d', argument='stride', dtype=type(stride)) + + self.padding = padding + + if isinstance(dilation, int): + self.dilation = (1, dilation, dilation, 1) + elif util.is_iterable(x=dilation) and len(dilation) == 2: + self.dilation = (1, dilation[0], dilation[1], 1) + else: + raise TensorforceError.type(name='Conv2d', argument='dilation', dtype=type(dilation)) + + super().__init__( + name=name, size=size, bias=bias, activation=activation, dropout=dropout, + vars_trainable=vars_trainable, input_spec=input_spec, + l2_regularization=l2_regularization + ) + + self.initialization_scale = initialization_scale + + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['window'] = str(window) + self.architecture_kwargs['padding'] = str(padding) + if stride != 1: + self.architecture_kwargs['stride'] = str(stride) + if dilation != 1: + self.architecture_kwargs['dilation'] = str(dilation) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if initialization_scale != 1.0: + self.architecture_kwargs['initialization_scale'] = str(initialization_scale) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0, 0, 0)) + + def output_spec(self): + output_spec = super().output_spec() + + height = conv_output_length( + input_length=output_spec.shape[0], filter_size=self.window[0], padding=self.padding, + stride=self.stride[1], dilation=self.dilation[1] + ) + width = conv_output_length( + input_length=output_spec.shape[1], filter_size=self.window[1], padding=self.padding, + stride=self.stride[2], dilation=self.dilation[2] + ) + + if self.squeeze: + output_spec.shape = (height, width) + else: + output_spec.shape = (height, width, self.size) + + output_spec.min_value = None + output_spec.max_value = None + + return output_spec + + def initialize(self): + super().initialize() + + in_size = self.input_spec.shape[2] + + initializer = 'orthogonal' + if self.activation is not None and self.activation.nonlinearity == 'relu': + initializer += '-relu' + + self.weights = self.variable( + name='weights', + spec=TensorSpec(type='float', shape=(self.window + (in_size, self.size))), + initializer=initializer, initialization_scale=self.initialization_scale, + 
is_trainable=self.vars_trainable, is_saved=True + ) + + @tf_function(num_args=1) + def apply(self, *, x): + x = tf.nn.conv2d( + input=x, filters=self.weights, strides=self.stride, padding=self.padding.upper(), + dilations=self.dilation + ) + + return super().apply(x=x) + + +class Conv1dTranspose(TransformationBase): + """ + 1-dimensional transposed convolutional layer, also known as deconvolution layer + (specification key: `deconv1d`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + window (int > 0): Window size + (default: 3). + output_width (int > 0): Output width + (default: same as input). + stride (int > 0): Stride size + (default: 1). + padding ('same' | 'valid'): Padding type, see + `TensorFlow docs `__ + (default: 'same'). + dilation (int > 0 | (int > 0, int > 0)): Dilation value + (default: 1). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: "relu"). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + initialization_scale (float > 0.0): Initialization scale + (default: 1.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__( + self, *, size, window=3, output_width=None, stride=1, padding='same', dilation=1, bias=True, + activation='relu', dropout=0.0, initialization_scale=1.0, vars_trainable=True, + l2_regularization=None, name=None, input_spec=None + ): + self.window = window + if output_width is None: + self.output_shape = None + elif self.squeeze: + self.output_shape = (output_width, max(1, size)) + self.stride = stride + self.padding = padding + self.dilation = dilation + + super().__init__( + name=name, size=size, bias=bias, activation=activation, dropout=dropout, + vars_trainable=vars_trainable, input_spec=input_spec, + l2_regularization=l2_regularization + ) + + self.initialization_scale = initialization_scale + + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['window'] = str(window) + if output_width is not None: + self.architecture_kwargs['output_width'] = str(output_width) + self.architecture_kwargs['padding'] = str(padding) + if stride != 1: + self.architecture_kwargs['stride'] = str(stride) + if dilation != 1: + self.architecture_kwargs['dilation'] = str(dilation) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if initialization_scale != 1.0: + self.architecture_kwargs['initialization_scale'] = str(initialization_scale) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0, 0)) + + def output_spec(self): + output_spec = super().output_spec() + + width = deconv_output_length( + input_length=output_spec.shape[0], filter_size=self.window, padding=self.padding, + stride=self.stride, dilation=self.dilation + ) + + if 
self.output_shape is None: + self.output_shape = (width, self.size) + + if self.squeeze: + output_spec.shape = self.output_shape[:1] + else: + output_spec.shape = self.output_shape + + output_spec.min_value = None + output_spec.max_value = None + + return output_spec + + def initialize(self): + super().initialize() + + in_size = self.input_spec.shape[1] + + initializer = 'orthogonal' + if self.activation is not None and self.activation.nonlinearity == 'relu': + initializer += '-relu' + + self.weights = self.variable( + name='weights', spec=TensorSpec(type='float', shape=(self.window, in_size, self.size)), + initializer=initializer, initialization_scale=self.initialization_scale, + is_trainable=self.vars_trainable, is_saved=True + ) + + @tf_function(num_args=1) + def apply(self, *, x): + output_shape = tf.concat(values=[ + tf_util.cast(x=tf.shape(input=x)[:1], dtype='int'), + tf_util.constant(value=self.output_shape, dtype='int') + ], axis=0) + x = tf.nn.conv1d_transpose( + input=x, filters=self.weights, output_shape=tf_util.int32(x=output_shape), + strides=self.stride, padding=self.padding.upper(), dilations=self.dilation + ) + + return super().apply(x=x) + + +class Conv2dTranspose(TransformationBase): + """ + 2-dimensional transposed convolutional layer, also known as deconvolution layer + (specification key: `deconv2d`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + window (int > 0 | (int > 0, int > 0)): Window size + (default: 3). + output_shape (int > 0 | (int > 0, int > 0)): Output shape + (default: same as input). + stride (int > 0 | (int > 0, int > 0)): Stride size + (default: 1). + padding ('same' | 'valid'): Padding type, see + `TensorFlow docs `__ + (default: 'same'). + dilation (int > 0 | (int > 0, int > 0)): Dilation value + (default: 1). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: "relu"). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + initialization_scale (float > 0.0): Initialization scale + (default: 1.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
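For the transposed variants, the default output length computed via Keras' `deconv_output_length` grows with the stride instead of shrinking; a plain-Python sketch of the two padding cases used here (assuming no output padding):

```python
# Sketch of the transposed-convolution output-length arithmetic (Keras
# deconv_output_length convention, output_padding=None) for 'same'/'valid'.
def deconv_length(input_length, window, padding, stride, dilation=1):
    effective_window = window + (window - 1) * (dilation - 1)
    if padding == 'same':
        return input_length * stride
    elif padding == 'valid':
        return input_length * stride + max(effective_window - stride, 0)
    raise ValueError(padding)

assert deconv_length(10, window=3, padding='same', stride=2) == 20
assert deconv_length(10, window=3, padding='valid', stride=2) == 21
```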
+ """ + + def __init__( + self, *, size, window=3, output_shape=None, stride=1, padding='same', dilation=1, bias=True, + activation='relu', dropout=0.0, initialization_scale=1.0, vars_trainable=True, + l2_regularization=None, name=None, input_spec=None + ): + if isinstance(window, int): + self.window = (window, window) + elif util.is_iterable(x=window) and len(window) == 2: + self.window = tuple(window) + else: + raise TensorforceError.type( + name='Conv2dTranspose', argument='window', dtype=type(window) + ) + + if output_shape is None: + self.output_shape = None + elif isinstance(output_shape, int): + self.output_shape = (output_shape, output_shape, max(1, size)) + elif util.is_iterable(x=window) and len(output_shape) == 2: + self.output_shape = (output_shape[0], output_shape[1], max(1, size)) + else: + raise TensorforceError.type( + name='Conv2dTranspose', argument='window', dtype=type(output_shape) + ) + + if isinstance(stride, int): + self.stride = (1, stride, stride, 1) + elif util.is_iterable(x=stride) and len(stride) == 2: + self.stride = (1, stride[0], stride[1], 1) + else: + raise TensorforceError.type( + name='Conv2dTranspose', argument='stride', dtype=type(stride) + ) + + self.padding = padding + + if isinstance(dilation, int): + self.dilation = (1, dilation, dilation, 1) + elif len(dilation) == 2: + self.dilation = (1, dilation[0], dilation[1], 1) + else: + raise TensorforceError.type( + name='Conv2dTranspose', argument='dilation', dtype=type(dilation) + ) + + super().__init__( + name=name, size=size, bias=bias, activation=activation, dropout=dropout, + vars_trainable=vars_trainable, input_spec=input_spec, + l2_regularization=l2_regularization + ) + + self.initialization_scale = initialization_scale + + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['window'] = str(window) + if output_shape is not None: + self.architecture_kwargs['output_width'] = str(output_shape) + self.architecture_kwargs['padding'] = str(padding) + if stride != 1: + self.architecture_kwargs['stride'] = str(stride) + if dilation != 1: + self.architecture_kwargs['dilation'] = str(dilation) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if initialization_scale != 1.0: + self.architecture_kwargs['initialization_scale'] = str(initialization_scale) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0, 0, 0)) + + def output_spec(self): + output_spec = super().output_spec() + + height = deconv_output_length( + input_length=output_spec.shape[0], filter_size=self.window[0], padding=self.padding, + stride=self.stride[1], dilation=self.dilation[1] + ) + width = deconv_output_length( + input_length=output_spec.shape[1], filter_size=self.window[1], padding=self.padding, + stride=self.stride[2], dilation=self.dilation[2] + ) + + if self.output_shape is None: + self.output_shape = (height, width, self.size) + + if self.squeeze: + output_spec.shape = self.output_shape[: 2] + else: + output_spec.shape = self.output_shape + + output_spec.min_value = None + output_spec.max_value = None + + return output_spec + + def initialize(self): + super().initialize() + + in_size = self.input_spec.shape[2] + + initializer = 'orthogonal' 
+ if self.activation is not None and self.activation.nonlinearity == 'relu': + initializer += '-relu' + + self.weights = self.variable( + name='weights', + spec=TensorSpec(type='float', shape=(self.window + (in_size, self.size))), + initializer=initializer, initialization_scale=self.initialization_scale, + is_trainable=self.vars_trainable, is_saved=True + ) + + @tf_function(num_args=1) + def apply(self, *, x): + output_shape = tf.concat(values=[ + tf_util.cast(x=tf.shape(input=x)[:1], dtype='int'), + tf_util.constant(value=self.output_shape, dtype='int') + ], axis=0) + x = tf.nn.conv2d_transpose( + input=x, filters=self.weights, output_shape=tf_util.int32(x=output_shape), + strides=self.stride, padding=self.padding.upper(), dilations=self.dilation + ) + + return super().apply(x=x) diff --git a/tensorforce/core/layers/dense.py b/tensorforce/core/layers/dense.py new file mode 100644 index 000000000..aae431a05 --- /dev/null +++ b/tensorforce/core/layers/dense.py @@ -0,0 +1,107 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import TensorSpec, tf_function +from tensorforce.core.layers import TransformationBase + + +class Dense(TransformationBase): + """ + Dense fully-connected layer (specification key: `dense`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: tanh). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + initialization_scale (float > 0.0): Initialization scale + (default: 1.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
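As with the other transformation layers, `size=0` requests a single output unit whose axis is then removed, as reflected in `output_spec` below. A small sketch of the resulting shape behaviour (hypothetical input shapes):

```python
# Shape behaviour of Dense.output_spec below (hypothetical shapes); as documented
# above, size=0 means one output unit whose axis is additionally squeezed away.
def dense_output_shape(input_shape, size):
    squeeze = (size == 0)
    return input_shape[:-1] if squeeze else input_shape[:-1] + (size,)

assert dense_output_shape((64,), size=32) == (32,)
assert dense_output_shape((64,), size=0) == ()
```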
+ """ + + def __init__( + self, *, size, bias=True, activation='tanh', dropout=0.0, initialization_scale=1.0, + vars_trainable=True, l2_regularization=None, name=None, input_spec=None + ): + super().__init__( + size=size, bias=bias, activation=activation, dropout=dropout, + vars_trainable=vars_trainable, l2_regularization=l2_regularization, name=name, + input_spec=input_spec + ) + + self.initialization_scale = initialization_scale + + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if initialization_scale != 1.0: + self.architecture_kwargs['initialization_scale'] = str(initialization_scale) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0,)) + + def output_spec(self): + output_spec = super().output_spec() + + if self.squeeze: + output_spec.shape = output_spec.shape[:-1] + else: + output_spec.shape = output_spec.shape[:-1] + (self.size,) + + output_spec.min_value = None + output_spec.max_value = None + + return output_spec + + def initialize(self): + super().initialize() + + in_size = self.input_spec.shape[0] + + initializer = 'orthogonal' + if self.activation is not None and self.activation.nonlinearity == 'relu': + initializer += '-relu' + + self.weights = self.variable( + name='weights', spec=TensorSpec(type='float', shape=(in_size, self.size)), + initializer=initializer, initialization_scale=self.initialization_scale, + is_trainable=self.vars_trainable, is_saved=True + ) + + @tf_function(num_args=1) + def apply(self, *, x): + x = tf.linalg.matmul(a=x, b=self.weights) + + return super().apply(x=x) diff --git a/tensorforce/core/layers/embedding.py b/tensorforce/core/layers/embedding.py new file mode 100644 index 000000000..8eee95aa1 --- /dev/null +++ b/tensorforce/core/layers/embedding.py @@ -0,0 +1,133 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.layers import TransformationBase + + +class Embedding(TransformationBase): + """ + Embedding layer (specification key: `embedding`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + num_embeddings (int > 0): If set, specifies the number of embeddings + (default: none). + max_norm (float): If set, embeddings are clipped if their L2-norm is larger + (default: none). + bias (bool): Whether to add a trainable bias variable + (default: true). 
+ activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: tanh). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__( + self, *, size, num_embeddings=None, max_norm=None, bias=True, activation='tanh', + dropout=0.0, vars_trainable=True, l2_regularization=None, name=None, input_spec=None + ): + super().__init__( + size=size, bias=bias, activation=activation, dropout=dropout, + vars_trainable=vars_trainable, l2_regularization=l2_regularization, name=name, + input_spec=input_spec + ) + + self.num_embeddings = num_embeddings + self.max_norm = max_norm + + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['num_embeddings'] = str(num_embeddings) + if max_norm is not None: + self.architecture_kwargs['max_norm'] = str(max_norm) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type=('int', 'bool'), shape=None, num_values=0) + + def output_spec(self): + output_spec = super().output_spec() + + output_spec.type = 'float' + if not self.squeeze: + if output_spec.shape is None: + output_spec.shape = (None, self.size) + else: + output_spec.shape = output_spec.shape + (self.size,) + + return output_spec + + def initialize(self): + super().initialize() + + if self.num_embeddings is None: + if self.input_spec.type == 'bool': + if self.num_embeddings is None: + self.num_embeddings = 2 + + elif self.input_spec.type == 'int': + if self.num_embeddings is None: + self.num_embeddings = self.input_spec.num_values + + if self.num_embeddings is None: + raise TensorforceError.required( + name='Embedding', argument='num_embeddings', + condition='input num_values is None' + ) + elif self.input_spec.num_values is not None and \ + self.num_embeddings < self.input_spec.num_values: + raise TensorforceError.required( + name='Embedding', argument='num_embeddings', + expected='>= input num_values' + ) + + self.architecture_kwargs['num_embeddings'] = str(self.num_embeddings) + + initializer = 'normal' + if self.activation is not None and self.activation.nonlinearity == 'relu': + initializer += '-relu' + + self.weights = self.variable( + name='embeddings', + spec=TensorSpec(type='float', shape=(self.num_embeddings, self.size)), + initializer=initializer, is_trainable=self.vars_trainable, is_saved=True + ) + + @tf_function(num_args=1) + def apply(self, *, x): + x = tf_util.int32(x=x) + x = tf.nn.embedding_lookup(params=self.weights, ids=x, max_norm=self.max_norm) + + return super().apply(x=x) diff --git a/tensorforce/core/layers/input_rnn.py b/tensorforce/core/layers/input_rnn.py new file mode 100644 index 000000000..810c237c8 --- /dev/null +++ b/tensorforce/core/layers/input_rnn.py @@ -0,0 +1,227 @@ +# Copyright 2020 Tensorforce Team. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.layers import TransformationBase + + +class InputRnn(TransformationBase): + """ + Recurrent neural network layer which is unrolled over a sequence input independently per + timestep, and consequently does not maintain an internal state (specification key: `input_rnn`). + + Args: + cell ('gru' | 'lstm'): The recurrent cell type + (required). + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + return_final_state (bool): Whether to return the final state instead of the per-step + outputs (default: true). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: tanh). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Additional arguments for Keras RNN layer, see + `TensorFlow docs `__. 
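A consequence of the `return_final_state` handling implemented below: for `cell='lstm'` the layer returns the concatenation of the final hidden and cell state, so `size` must be even and the underlying LSTM uses `size // 2` units, whereas for `cell='gru'` the final state already has `size` units. A hypothetical specification for a state that is a sequence of feature vectors:

```python
# Hypothetical use of the input_lstm specification key; with size=64 and
# return_final_state=True, the underlying LSTM has 32 units and the concatenated
# (hidden, cell) final state of size 64 is passed on to the next layer.
state_network = [
    dict(type='input_lstm', size=64, return_final_state=True),
    dict(type='dense', size=32, activation='tanh'),
]
```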
+ """ + + def __init__( + self, *, cell, size, return_final_state=True, bias=True, activation='tanh', dropout=0.0, + vars_trainable=True, l2_regularization=None, name=None, input_spec=None, **kwargs + ): + self.cell_type = cell + self.return_final_state = return_final_state + + super().__init__( + size=size, bias=bias, activation=activation, dropout=dropout, + vars_trainable=vars_trainable, l2_regularization=l2_regularization, name=name, + input_spec=input_spec + ) + + if self.squeeze and self.return_final_state: + raise TensorforceError.value( + name='rnn', argument='return_final_state', value=return_final_state, + condition='size = 0' + ) + + if self.cell_type == 'gru': + self.rnn = tf.keras.layers.GRU( + units=self.size, return_sequences=True, return_state=True, name='rnn', + input_shape=input_spec.shape, **kwargs # , dtype=tf_util.get_dtype(type='float') + ) + elif self.cell_type == 'lstm': + if self.return_final_state: + assert self.size % 2 == 0 + size = self.size // 2 + else: + size = self.size + self.rnn = tf.keras.layers.LSTM( + units=size, return_sequences=True, return_state=True, name='rnn', + input_shape=input_spec.shape, **kwargs # , dtype=tf_util.get_dtype(type='float') + ) + else: + raise TensorforceError.value( + name='Rnn', argument='cell', value=self.cell_type, hint='not in {gru,lstm}' + ) + + self.architecture_kwargs['cell'] = cell + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['return_final_state'] = str(return_final_state) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=(-1, 0)) + + def output_spec(self): + output_spec = super().output_spec() + + if self.squeeze: + output_spec.shape = output_spec.shape[:-1] + elif not self.return_final_state: + output_spec.shape = output_spec.shape[:-1] + (self.size,) + else: + output_spec.shape = output_spec.shape[:-2] + (self.size,) + + output_spec.min_value = None + output_spec.max_value = None + + return output_spec + + def initialize(self): + super().initialize() + + self.rnn.build(input_shape=((None,) + self.input_spec.shape)) + + @tf_function(num_args=0) + def regularize(self): + regularization_loss = super().regularize() + + if len(self.rnn.losses) > 0: + regularization_loss += tf.math.add_n(inputs=self.rnn.losses) + + return regularization_loss + + @tf_function(num_args=1) + def apply(self, *, x): + x = tf_util.float32(x=x) + x = self.rnn(inputs=x, initial_state=None) + + if not self.return_final_state: + x = tf_util.cast(x=x[0], dtype='float') + elif self.cell_type == 'gru': + x = tf_util.cast(x=x[1], dtype='float') + elif self.cell_type == 'lstm': + x = tf_util.cast(x=tf.concat(values=(x[1], x[2]), axis=1), dtype='float') + + return super().apply(x=x) + + +class InputGru(InputRnn): + """ + Gated recurrent unit layer which is unrolled over a sequence input independently per timestep, + and consequently does not maintain an internal state (specification key: `input_gru`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). 
+ return_final_state (bool): Whether to return the final state instead of the per-step + outputs (default: true). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: tanh). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Additional arguments for Keras GRU layer, see + `TensorFlow docs `__. + """ + + def __init__( + self, *, size, return_final_state=True, bias=True, activation='tanh', dropout=0.0, + vars_trainable=True, l2_regularization=None, name=None, input_spec=None, **kwargs + ): + super().__init__( + cell='gru', size=size, return_final_state=return_final_state, bias=bias, + activation=activation, dropout=dropout, vars_trainable=vars_trainable, + l2_regularization=l2_regularization, name=name, input_spec=input_spec, **kwargs + ) + + +class InputLstm(InputRnn): + """ + Long short-term memory layer which is unrolled over a sequence input independently per timestep, + and consequently does not maintain an internal state (specification key: `input_lstm`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + return_final_state (bool): Whether to return the final state instead of the per-step + outputs (default: true). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: tanh). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Additional arguments for Keras LSTM layer, see + `TensorFlow docs `__. + """ + + def __init__( + self, *, size, return_final_state=True, bias=True, activation='tanh', dropout=0.0, + vars_trainable=True, l2_regularization=None, name=None, input_spec=None, **kwargs + ): + super().__init__( + cell='lstm', size=size, return_final_state=return_final_state, bias=bias, + activation=activation, dropout=dropout, vars_trainable=vars_trainable, + l2_regularization=l2_regularization, name=name, input_spec=input_spec, **kwargs + ) diff --git a/tensorforce/core/layers/keras.py b/tensorforce/core/layers/keras.py new file mode 100644 index 000000000..cb1532a44 --- /dev/null +++ b/tensorforce/core/layers/keras.py @@ -0,0 +1,78 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
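For illustration, the sequence-input layers above are addressed via their specification keys (`input_rnn`, `input_gru`, `input_lstm`) inside a network specification. A minimal sketch, assuming a state that carries an explicit sequence axis; the state shape and the trailing `dense` layer are placeholders:

```python
# Hypothetical state: a sequence of 20 steps with 8 features each
states = dict(type='float', shape=(20, 8))

network = [
    # Unrolls an LSTM over the sequence axis; with return_final_state=True the cell and
    # hidden state are concatenated, so size must be even (see the assert in InputRnn)
    dict(type='input_lstm', size=64, return_final_state=True),
    dict(type='dense', size=64, activation='relu')
]
```

With `return_final_state=False`, the per-step outputs are returned instead and the sequence axis is preserved in the output shape.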
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import tf_function, tf_util +from tensorforce.core.layers import Layer + + +class KerasLayer(Layer): + """ + Keras layer (specification key: `keras`). + + Args: + layer (string): Keras layer class name, see + `TensorFlow docs `__ + (required). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Arguments for the Keras layer, see + `TensorFlow docs `__. + """ + + def __init__(self, *, layer, l2_regularization=None, name=None, input_spec=None, **kwargs): + super().__init__(l2_regularization=l2_regularization, name=name, input_spec=input_spec) + + self.keras_layer = getattr(tf.keras.layers, layer)( + name=name, dtype=tf_util.get_dtype(type='float'), input_shape=input_spec.shape, **kwargs + ) + + self.architecture_kwargs['layer'] = str(layer) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def output_spec(self): + output_spec = super().output_spec() + + output_spec.type = 'float' + output_spec.shape = self.keras_layer.compute_output_shape( + input_shape=((None,) + output_spec.shape) + )[1:] + + return output_spec + + def initialize(self): + super().initialize() + + self.keras_layer.build(input_shape=((None,) + self.input_spec.shape)) + + @tf_function(num_args=0) + def regularize(self): + regularization_loss = super().regularize() + + if len(self.keras_layer.losses) > 0: + regularization_loss += tf.math.add_n(inputs=self.keras_layer.losses) + + return regularization_loss + + @tf_function(num_args=1) + def apply(self, *, x): + x = self.keras_layer.call(inputs=x) + + return x diff --git a/tensorforce/core/layers/layer.py b/tensorforce/core/layers/layer.py new file mode 100644 index 000000000..ce1936c7b --- /dev/null +++ b/tensorforce/core/layers/layer.py @@ -0,0 +1,843 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
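As a usage sketch for the wrapper above: any `tf.keras.layers` class can be addressed by name via the `keras` specification key, with the remaining keyword arguments forwarded to the Keras constructor. The layer choices and sizes below are placeholders:

```python
network = [
    # Instantiates tf.keras.layers.Flatten(...)
    dict(type='keras', layer='Flatten'),
    # Instantiates tf.keras.layers.Dense(units=64, activation='relu', ...)
    dict(type='keras', layer='Dense', units=64, activation='relu')
]
```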
+# ============================================================================== + +from collections import Counter, OrderedDict + +import tensorflow as tf + +from tensorforce import TensorforceError, util +import tensorforce.core +from tensorforce.core import ArrayDict, Module, parameter_modules, SignatureDict, TensorSpec, \ + TensorsSpec, tf_function, tf_util +from tensorforce.core.parameters import Parameter + + +class Layer(Module): + """ + Base class for neural network layers. + + Args: + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + _TF_MODULE_IGNORED_PROPERTIES = Module._TF_MODULE_IGNORED_PROPERTIES | {'_REGISTERED_LAYERS'} + + # _REGISTERED_LAYERS # Initialized as part of model.__init__() + + def __init__(self, *, l2_regularization=None, name=None, input_spec=None): + super().__init__(l2_regularization=l2_regularization, name=name) + + Layer._REGISTERED_LAYERS[self.name] = self + + self.input_spec = self.default_input_spec() + if not isinstance(self.input_spec, TensorSpec): + raise TensorforceError.unexpected() + + self.input_spec = self.input_spec.unify( + other=input_spec, name=(self.__class__.__name__ + ' input') + ) + + self.architecture_kwargs = OrderedDict() + if name is not None: + self.architecture_kwargs['name'] = name + + def get_architecture(self): + if len(self.architecture_kwargs) == 0: + return self.__class__.__name__ + else: + assert all( + isinstance(key, str) and isinstance(value, str) + for key, value in self.architecture_kwargs.items() + ) + return '{}({})'.format(self.__class__.__name__, ', '.join( + '{}={}'.format(key, value) for key, value in self.architecture_kwargs.items() + )) + + def default_input_spec(self): + return TensorSpec(type=None, shape=None, overwrite=True) + + def output_spec(self): + return self.input_spec.copy(overwrite=True) + + def submodule(self, *args, **kwargs): + layer = super().submodule(*args, **kwargs) + + if not isinstance(layer, (Layer, Parameter)): + raise TensorforceError.type(name='layer', argument='submodule', dtype=type(layer)) + + return layer + + def input_signature(self, *, function): + if function == 'apply': + return SignatureDict(x=self.input_spec.signature(batched=True)) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'apply': + return SignatureDict(singleton=self.output_spec().signature(batched=True)) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=1) + def apply(self, *, x): + raise NotImplementedError + + +class MultiInputLayer(Layer): + """ + Base class for multi-input layers. + + Args: + tensors (iter[string]): Names of tensors to retrieve, either state names or previously + registered tensors + (required). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
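The `Layer` base class above defines the contract for layers: `default_input_spec` declares what input is accepted, `output_spec` derives the output specification, and the `tf_function`-decorated `apply` implements the computation. A minimal sketch of a hypothetical subclass following that pattern (the `Scale` class and its `factor` argument are illustrative, not part of the library):

```python
from tensorforce.core import TensorSpec, tf_function
from tensorforce.core.layers import Layer


class Scale(Layer):
    """Hypothetical layer which multiplies its input by a constant factor."""

    def __init__(self, *, factor=2.0, name=None, input_spec=None):
        super().__init__(name=name, input_spec=input_spec)
        self.factor = factor

    def default_input_spec(self):
        # Accept any float tensor; unified with the actual input spec by Layer.__init__
        return TensorSpec(type='float', shape=None)

    def output_spec(self):
        # Same type and shape as the input
        return super().output_spec()

    @tf_function(num_args=1)
    def apply(self, *, x):
        return x * self.factor
```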
+ """ + + def __init__(self, *, tensors, l2_regularization=None, name=None, input_spec=None): + super(Layer, self).__init__(l2_regularization=l2_regularization, name=name) + + Layer._REGISTERED_LAYERS[self.name] = self + + self.architecture_kwargs = OrderedDict() + if name is not None: + self.architecture_kwargs['name'] = name + + if isinstance(tensors, str): + pass + elif not util.is_iterable(x=tensors): + raise TensorforceError.type( + name='MultiInputLayer', argument='tensors', dtype=type(tensors) + ) + elif len(tensors) == 0: + raise TensorforceError.value( + name='MultiInputLayer', argument='tensors', value=tensors, hint='zero length' + ) + + if isinstance(tensors, str): + self.tensors = (tensors,) + else: + self.tensors = tuple(tensors) + + self.input_spec = self.default_input_spec() + if not isinstance(self.input_spec, TensorsSpec): + raise TensorforceError.unexpected() + + self.input_spec = self.input_spec.unify(other=input_spec) + + def default_input_spec(self): + return TensorsSpec( + ((tensor, TensorSpec(type=None, shape=None, overwrite=True)) for tensor in self.tensors) + ) + + def output_spec(self): + return TensorSpec(type=None, shape=None, overwrite=True) + + @tf_function(num_args=1) + def apply(self, *, x): + raise NotImplementedError + + +class NondeterministicLayer(Layer): + """ + Base class for nondeterministic layers. + + Args: + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def input_signature(self, *, function): + if function == 'apply': + return SignatureDict( + x=self.input_spec.signature(batched=True), + deterministic=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + else: + return super().input_signature(function=function) + + @tf_function(num_args=2, overwrites_signature=True) + def apply(self, *, x, deterministic): + raise NotImplementedError + + +class Register(Layer): + """ + Tensor retrieval layer, which is useful when defining more complex network architectures which + do not follow the sequential layer-stack pattern, for instance, when handling multiple inputs + (specification key: `register`). + + Args: + tensor (string): Name under which tensor will be registered + (required). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__(self, *, tensor, name=None, input_spec=None): + super().__init__(name=name, input_spec=input_spec) + + if not isinstance(tensor, str): + raise TensorforceError.type(name='register', argument='tensor', dtype=type(tensor)) + + self.tensor = tensor + + self.architecture_kwargs['tensor'] = tensor + + @tf_function(num_args=1) + def apply(self, *, x): + return x + + +class Retrieve(MultiInputLayer): + """ + Tensor retrieval layer, which is useful when defining more complex network architectures which + do not follow the sequential layer-stack pattern, for instance, when handling multiple inputs + (specification key: `retrieve`). + + Args: + tensors (str | iter[string]): Name(s) of tensor(s) to retrieve, either state names or + previously registered tensors + (required). + aggregation ('concat' | 'product' | 'stack' | 'sum'): Aggregation type in case of multiple + tensors + (default: 'concat'). + axis (int >= 0): Aggregation axis, excluding batch axis + (default: 0). + name (string): Layer name + (default: internally chosen). 
+ input_spec (specification): internal use. + """ + + def __init__(self, *, tensors, aggregation='concat', axis=0, name=None, input_spec=None): + super().__init__(tensors=tensors, name=name, input_spec=input_spec) + + if aggregation not in ('concat', 'product', 'stack', 'sum'): + raise TensorforceError.value( + name='retrieve', argument='aggregation', value=aggregation, + hint='not in {concat,product,stack,sum}' + ) + + self.aggregation = aggregation + self.axis = axis + + if len(self.tensors) == 1: + self.architecture_kwargs['tensor'] = self.tensors[0] + else: + self.architecture_kwargs['tensors'] = '[{}]'.format(', '.join(self.tensors)) + self.architecture_kwargs['aggregation'] = aggregation + self.architecture_kwargs['axis'] = str(axis) + + def output_spec(self): + if len(self.tensors) == 1: + return self.input_spec[self.tensors[0]] + + # Get tensor types and shapes + dtypes = list() + shapes = list() + for spec in self.input_spec.values(): + dtypes.append(spec.type) + shapes.append(spec.shape) + + # Check tensor types + if all(dtype == dtypes[0] for dtype in dtypes): + dtype = dtypes[0] + else: + raise TensorforceError.value(name='retrieve', argument='tensor types', value=dtypes) + + if self.aggregation == 'concat': + if any(len(shape) != len(shapes[0]) for shape in shapes): + raise TensorforceError.value( + name='retrieve', argument='tensor shapes', value=shapes + ) + elif any( + shape[n] != shapes[0][n] for shape in shapes for n in range(len(shape)) + if n != self.axis + ): + raise TensorforceError.value( + name='retrieve', argument='tensor shapes', value=shapes + ) + shape = tuple( + sum(shape[n] for shape in shapes) if n == self.axis else shapes[0][n] + for n in range(len(shapes[0])) + ) + + elif self.aggregation == 'stack': + if any(len(shape) != len(shapes[0]) for shape in shapes): + raise TensorforceError.value( + name='retrieve', argument='tensor shapes', value=shapes + ) + elif any(shape[n] != shapes[0][n] for shape in shapes for n in range(len(shape))): + raise TensorforceError.value( + name='retrieve', argument='tensor shapes', value=shapes + ) + shape = tuple( + len(shapes) if n == self.axis else shapes[0][n - int(n > self.axis)] + for n in range(len(shapes[0]) + 1) + ) + + else: + # Check and unify tensor shapes + for shape in shapes: + if len(shape) != len(shapes[0]): + raise TensorforceError.value( + name='retrieve', argument='tensor shapes', value=shapes + ) + if any(x != y and x != 1 and y != 1 for x, y in zip(shape, shapes[0])): + raise TensorforceError.value( + name='retrieve', argument='tensor shapes', value=shapes + ) + shape = tuple(max(shape[n] for shape in shapes) for n in range(len(shapes[0]))) + + # TODO: Missing num_values, min/max_value + return TensorSpec(type=dtype, shape=shape) + + @tf_function(num_args=1) + def apply(self, *, x): + if len(self.tensors) == 1: + return x[self.tensors[0]] + + x = list(x.values()) + + shape = self.output_spec().shape + for n, tensor in enumerate(x): + for axis in range(tf_util.rank(x=tensor), len(shape)): + tensor = tf.expand_dims(input=tensor, axis=axis) + x[n] = tensor + + if self.aggregation == 'concat': + x = tf.concat(values=x, axis=(self.axis + 1)) + + elif self.aggregation == 'product': + x = tf.stack(values=x, axis=(self.axis + 1)) + x = tf.reduce_prod(input_tensor=x, axis=(self.axis + 1)) + + elif self.aggregation == 'stack': + x = tf.stack(values=x, axis=(self.axis + 1)) + + elif self.aggregation == 'sum': + x = tf.stack(values=x, axis=(self.axis + 1)) + x = tf.reduce_sum(input_tensor=x, axis=(self.axis + 1)) + + 
return x + + +class Block(Layer): + """ + Block of layers (specification key: `block`). + + Args: + layers (iter[specification]): Layers configuration, see [layers](../modules/layers.html) + (required). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__(self, *, layers, name=None, input_spec=None): + # TODO: handle internal states and combine with layered network + if len(layers) == 0: + raise TensorforceError.value( + name='block', argument='layers', value=layers, hint='zero length' + ) + + self._input_spec = input_spec + self.layers_spec = list(layers) + self.layers = list() + + super().__init__(name=name, input_spec=input_spec) + + if len(self.layers_spec) == 0: + self.architecture_kwargs['layers'] = '[]' + else: + self.architecture_kwargs['layers'] = '[\n {}\n]'.format('\n '.join( + layer.get_architecture().replace('\n', '\n ') for layer in self.layers + )) + + def default_input_spec(self): + # if not isinstance(self.layers[0], Layer): + layer_counter = Counter() + for layer_spec in self.layers_spec: + if 'name' in layer_spec: + layer_spec = dict(layer_spec) + layer_name = layer_spec.pop('name') + else: + if isinstance(layer_spec.get('type'), str): + layer_type = layer_spec['type'] + else: + layer_type = 'layer' + layer_name = layer_type + str(layer_counter[layer_type]) + layer_counter[layer_type] += 1 + + # layer_name = self.name + '-' + layer_name + if layer_spec.get('type') == 'register': + raise TensorforceError.invalid(name='block-layer', argument='register-layer') + elif layer_spec.get('type') == 'retrieve': + raise TensorforceError.invalid(name='block-layer', argument='retrieve-layer') + layer = self.submodule( + name=layer_name, module=layer_spec, modules=tensorforce.core.layer_modules, + input_spec=self._input_spec + ) + self.layers.append(layer) + self._input_spec = layer.output_spec() + + return self.layers[0].input_spec.copy() + + def output_spec(self): + return self.layers[-1].output_spec() + + def apply(self, *, x): + raise NotImplementedError + + +class Reuse(Layer): + """ + Reuse layer (specification key: `reuse`). + + Args: + layer (string): Name of a previously defined layer + (required). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__(self, *, layer, name=None, input_spec=None): + if layer not in Layer._REGISTERED_LAYERS: + raise TensorforceError.value(name='reuse', argument='layer', value=layer) + + self.layer = layer + + super().__init__(name=name, input_spec=input_spec, l2_regularization=0.0) + + self.architecture_kwargs['layer'] = layer + + @property + def reused_layer(self): + return Layer._REGISTERED_LAYERS[self.layer] + + def get_architecture(self): + return '{}\n > {}'.format( + super().get_architecture(), + self.reused_layer.get_architecture().replace('\n', '\n > ') + ) + + def default_input_spec(self): + return self.reused_layer.input_spec.copy() + + def output_spec(self): + return self.reused_layer.output_spec() + + def apply(self, *, x): + raise NotImplementedError + + # TODO: other Module functions? + def get_available_summaries(self): + summaries = super().get_available_summaries() + summaries.update(self.reused_layer.get_available_summaries()) + return sorted(summaries) + + +class StatefulLayer(Layer): # TODO: WeaklyStatefulLayer ? + """ + Base class for stateful layers, i.e. layers which update an internal state for on-policy calls. 
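The `register` and `retrieve` layers above exist to express non-sequential architectures. A sketch of the intended pattern for two input states, roughly following the multi-input network layout from the Tensorforce documentation; the state names, sizes, and the `conv2d`/`flatten`/`dense` layers are placeholders:

```python
network = [
    [
        dict(type='retrieve', tensors=['image']),         # fetch state 'image'
        dict(type='conv2d', size=32),
        dict(type='flatten'),
        dict(type='register', tensor='image-embedding')   # register intermediate tensor
    ],
    [
        dict(type='retrieve', tensors=['vector']),
        dict(type='dense', size=32),
        dict(type='register', tensor='vector-embedding')
    ],
    [
        # Retrieve both registered embeddings and concatenate them
        dict(type='retrieve', tensors=['image-embedding', 'vector-embedding'],
             aggregation='concat'),
        dict(type='dense', size=64)
    ]
]
```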
+ + Args: + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + @tf_function(num_args=1) + def apply(self, *, x, independent): + raise NotImplementedError + + +class TemporalLayer(Layer): + """ + Base class for temporal layers, i.e. layers whose output depends on previous states. + + Args: + temporal_processing ('cumulative' | 'iterative'): Temporal processing type + (required). + horizon (parameter, int >= 0): Past horizon + (required). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Additional arguments for potential parent class. + """ + + def __init__( + self, *, temporal_processing, horizon, l2_regularization=None, name=None, input_spec=None, + **kwargs + ): + if temporal_processing not in ('cumulative', 'iterative'): + raise TensorforceError.value( + name='temporal-layer', argument='temporal_processing', value=temporal_processing, + hint='not in {cumulative,iterative}' + ) + self.temporal_processing = temporal_processing + + super().__init__( + l2_regularization=l2_regularization, name=name, input_spec=input_spec, **kwargs + ) + + if self.temporal_processing == 'cumulative' and len(self.internals_spec) > 0: + raise TensorforceError.invalid( + name='temporal-layer', argument='temporal_processing', expected='iterative', + condition='num internals > 0' + ) + + if horizon is None: + horizon = 0 + self.horizon = self.submodule( + name='horizon', module=horizon, modules=parameter_modules, is_trainable=False, + dtype='int', min_value=0 + ) + + @property + def internals_spec(self): + return TensorsSpec() + + def internals_init(self): + return ArrayDict() + + def max_past_horizon(self, *, on_policy): + if self.temporal_processing == 'iterative' and on_policy: + return 0 + else: + return self.horizon.max_value() + + def input_signature(self, *, function): + if function == 'apply': + assert len(self.internals_spec) == 0 or self.temporal_processing == 'iterative' + return SignatureDict( + x=self.input_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True) + ) + + elif function == 'cumulative_apply': + assert self.temporal_processing == 'cumulative' + cumulative_input_spec = self.input_spec.copy() + cumulative_input_spec.shape = (None,) + cumulative_input_spec.shape + return SignatureDict( + xs=cumulative_input_spec.signature(batched=True), + lengths=TensorSpec(type='int', shape=()).signature(batched=True) + ) + + elif function == 'iterative_apply': + assert self.temporal_processing == 'iterative' + return SignatureDict( + x=self.input_spec.signature(batched=True), + internals=self.internals_spec.signature(batched=True) + ) + + elif function == 'iterative_body': + assert self.temporal_processing == 'iterative' + return SignatureDict( + x=self.input_spec.signature(batched=True), + indices=TensorSpec(type='int', shape=()).signature(batched=True), + remaining=TensorSpec(type='int', shape=()).signature(batched=True), + current_x=self.output_spec().signature(batched=True), + current_internals=self.internals_spec.signature(batched=True) + ) + + elif function == 'past_horizon': + return SignatureDict() + + else: + return 
super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'apply': + if self.temporal_processing == 'cumulative': + return SignatureDict(singleton=self.output_spec().signature(batched=True)) + elif self.temporal_processing == 'iterative': + return SignatureDict( + x=self.output_spec().signature(batched=True), + internals=self.internals_spec.signature(batched=True) + ) + + elif function == 'cumulative_apply': + assert self.temporal_processing == 'cumulative' + return SignatureDict(singleton=self.output_spec().signature(batched=True)) + + elif function == 'iterative_apply': + assert self.temporal_processing == 'iterative' + return SignatureDict( + x=self.output_spec().signature(batched=True), + internals=self.internals_spec.signature(batched=True) + ) + + elif function == 'iterative_body': + assert self.temporal_processing == 'iterative' + return SignatureDict( + x=self.input_spec.signature(batched=True), + indices=TensorSpec(type='int', shape=()).signature(batched=True), + remaining=TensorSpec(type='int', shape=()).signature(batched=True), + current_x=self.output_spec().signature(batched=True), + current_internals=self.internals_spec.signature(batched=True) + ) + + elif function == 'past_horizon': + return SignatureDict( + singleton=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=0) + def past_horizon(self, *, on_policy): + if self.temporal_processing == 'iterative' and on_policy: + return tf_util.constant(value=0, dtype='int') + else: + return self.horizon.value() + + @tf_function(num_args=3, overwrites_signature=True) + def apply(self, *, x, horizons, internals): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + batch_size = tf_util.cast(x=tf.shape(input=horizons)[:1], dtype='int') + zeros = tf_util.zeros(shape=batch_size, dtype='int') + ones = tf_util.ones(shape=batch_size, dtype='int') + batch_size = batch_size[0] + + # including 0th step + horizon = self.horizon.value() + one + # in case of longer horizon than necessary (e.g. 
main vs baseline policy) + starts = horizons[:, 0] + tf.maximum(x=(horizons[:, 1] - horizon), y=zeros) + lengths = horizons[:, 1] - tf.maximum(x=(horizons[:, 1] - horizon), y=zeros) + horizon = tf.minimum(x=horizon, y=tf.math.reduce_max(input_tensor=lengths, axis=0)) + output_spec = self.output_spec() + + if self.temporal_processing == 'cumulative': + if self.horizon.is_constant(value=0): + x = self.iterative_apply(xs=x, lengths=ones) + + else: + def body(x, indices, remaining, xs): + current_x = tf.gather(params=x, indices=indices) + current_x = tf.expand_dims(input=current_x, axis=1) + xs = tf.concat(values=(xs, current_x), axis=1) + remaining -= tf.where( + condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones + ) + indices += tf.where(condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones) + return x, indices, remaining, xs + + initial_xs = tf_util.zeros( + shape=((batch_size, 0) + output_spec.shape), dtype=output_spec.type + ) + + _, final_indices, final_remaining, xs = tf.while_loop( + cond=tf_util.always_true, body=body, loop_vars=(x, starts, lengths, initial_xs), + maximum_iterations=tf_util.int64(x=horizon) + ) + + x = self.cumulative_apply(xs=xs, lengths=lengths) + + elif self.temporal_processing == 'iterative': + if self.horizon.is_constant(value=0): + x, final_internals = self.iterative_apply(x=x, internals=internals) + + else: + shape = tf.concat(values=[ + tf.expand_dims(input=batch_size, axis=0), + tf_util.constant(value=output_spec.shape, dtype='int') + ], axis=0) + initial_x = tf_util.zeros(shape=shape, dtype=output_spec.type) + + signature = self.input_signature(function='iterative_body') + internals = signature['current_internals'].kwargs_to_args(kwargs=internals) + _, final_indices, final_remaining, x, final_internals = tf.while_loop( + cond=tf_util.always_true, body=self.iterative_body, + loop_vars=(x, starts, lengths, initial_x, internals), + maximum_iterations=tf_util.int32(x=horizon) + ) + internals = signature['current_internals'].args_to_kwargs(args=final_internals) + + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_equal( + x=final_indices, y=(tf.math.cumsum(x=lengths) - ones) + )) + assertions.append(tf.debugging.assert_equal( + x=tf.math.reduce_sum(input_tensor=final_remaining), y=zero + )) + + with tf.control_dependencies(control_inputs=assertions): + if self.temporal_processing == 'cumulative': + return tf_util.identity(input=super().apply(x=x)) + elif self.temporal_processing == 'iterative': + return tf_util.identity(input=super().apply(x=x)), internals + + @tf_function(num_args=5, is_loop_body=True) + def iterative_body(self, x, indices, remaining, current_x, current_internals): + batch_size = tf_util.cast(x=tf.shape(input=current_x)[:1], dtype='int') + zeros = tf_util.zeros(shape=batch_size, dtype='int') + ones = tf_util.ones(shape=batch_size, dtype='int') + batch_size = batch_size[0] + + current_x = tf.gather(params=x, indices=indices) + next_x, next_internals = self.iterative_apply( + x=current_x, internals=current_internals + ) + + with tf.control_dependencies(control_inputs=(current_x, next_x)): + is_finished = tf.math.equal(x=remaining, y=zeros) + if isinstance(next_internals, dict): + for name, current_internal, next_internal in current_internals.zip_items( + next_internals + ): + condition = is_finished + for _ in range(tf_util.rank(x=current_internal) - 1): + condition = tf.expand_dims(input=condition, axis=1) + next_internals[name] = tf.where( + condition=condition, 
x=current_internal, y=next_internal + ) + + else: + condition = is_finished + for _ in range(tf_util.rank(x=current_internals) - 1): + condition = tf.expand_dims(input=condition, axis=1) + next_internals = tf.where( + condition=condition, x=current_internals, y=next_internals + ) + + remaining -= tf.where(condition=is_finished, x=zeros, y=ones) + indices += tf.where( + condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones + ) + + return x, indices, remaining, next_x, next_internals + + @tf_function(num_args=1) + def cumulative_apply(self, *, xs, lengths): + raise NotImplementedError + + @tf_function(num_args=2) + def iterative_apply(self, *, x, internals): + raise NotImplementedError + + +class TransformationBase(Layer): + """ + Base class for transformation layers. + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + bias (bool): Whether to add a trainable bias variable + (default: false). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: none). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Additional arguments for potential parent class. + """ + + def __init__( + self, *, size, bias=False, activation=None, dropout=0.0, vars_trainable=True, + l2_regularization=None, name=None, input_spec=None, **kwargs + ): + super().__init__( + l2_regularization=l2_regularization, name=name, input_spec=input_spec, **kwargs + ) + + self.squeeze = (size == 0) + self.size = max(size, 1) + self.bias = bias + + if activation is None: + self.activation = None + else: + self.activation = self.submodule( + name='activation', module='activation', modules=tensorforce.core.layer_modules, + nonlinearity=activation, input_spec=self.output_spec() + ) + + if dropout == 0.0: + self.dropout = None + else: + self.dropout = self.submodule( + name='dropout', module='dropout', modules=tensorforce.core.layer_modules, + rate=dropout, input_spec=self.output_spec() + ) + + self.vars_trainable = vars_trainable + + def initialize(self): + super().initialize() + + if isinstance(self.bias, str): + # Hack for Rnn to avoid name clash with Keras variable name + self.bias = self.variable( + name=self.bias, spec=TensorSpec(type='float', shape=(self.size,)), + initializer='zeros', is_trainable=self.vars_trainable, is_saved=True + ) + elif self.bias: + self.bias = self.variable( + name='bias', spec=TensorSpec(type='float', shape=(self.size,)), initializer='zeros', + is_trainable=self.vars_trainable, is_saved=True + ) + else: + self.bias = None + + @tf_function(num_args=1) + def apply(self, *, x): + if self.bias is not None: + x = tf.nn.bias_add(value=x, bias=self.bias) + + if self.squeeze: + x = tf.squeeze(input=x, axis=-1) + + if self.activation is not None: + x = self.activation.apply(x=x) + + if self.dropout is not None: + x = self.dropout.apply(x=x) + + return x diff --git a/tensorforce/core/layers/linear.py b/tensorforce/core/layers/linear.py new file mode 100644 index 000000000..549f31c11 --- /dev/null +++ b/tensorforce/core/layers/linear.py @@ -0,0 +1,94 @@ +# Copyright 2020 Tensorforce Team. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import tf_function, TensorSpec +from tensorforce.core.layers import Conv1d, Conv2d, Dense, Layer + + +class Linear(Layer): + """ + Linear layer (specification key: `linear`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + bias (bool): Whether to add a trainable bias variable + (default: true). + initialization_scale (float > 0.0): Initialization scale + (default: 1.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__( + self, *, size, bias=True, initialization_scale=1.0, vars_trainable=True, + l2_regularization=None, name=None, input_spec=None + ): + super().__init__(l2_regularization=l2_regularization, name=name, input_spec=input_spec) + + if len(self.input_spec.shape) <= 1: + self.linear = self.submodule( + name='linear', module=Dense, size=size, bias=bias, activation=None, dropout=0.0, + initialization_scale=initialization_scale, vars_trainable=vars_trainable, + input_spec=self.input_spec + ) + elif len(self.input_spec.shape) == 2: + self.linear = self.submodule( + name='linear', module=Conv1d, size=size, window=1, bias=bias, activation=None, + dropout=0.0, initialization_scale=initialization_scale, + vars_trainable=vars_trainable, input_spec=self.input_spec + ) + elif len(self.input_spec.shape) == 3: + self.linear = self.submodule( + name='linear', module=Conv2d, size=size, window=1, bias=bias, activation=None, + dropout=0.0, initialization_scale=initialization_scale, + vars_trainable=vars_trainable, input_spec=self.input_spec + ) + else: + raise TensorforceError.value( + name='Linear', argument='input rank', value=len(self.input_spec.shape), hint='<= 3' + ) + + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['bias'] = str(bias) + if initialization_scale != 1.0: + self.architecture_kwargs['initialization_scale'] = str(initialization_scale) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + def output_spec(self): + return self.linear.output_spec() + + @tf_function(num_args=1) + def apply(self, *, x): + if len(self.input_spec.shape) == 0: + x = tf.expand_dims(input=x, axis=1) + + x = self.linear.apply(x=x) + + return x diff --git a/tensorforce/core/layers/misc.py b/tensorforce/core/layers/misc.py new file mode 100644 index 000000000..25a98b645 --- /dev/null +++ 
b/tensorforce/core/layers/misc.py @@ -0,0 +1,230 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import math +import random + +import numpy as np +import tensorflow as tf + +from tensorforce import TensorforceError, util +from tensorforce.core import parameter_modules, TensorSpec, tf_function, tf_util +from tensorforce.core.layers import Layer, NondeterministicLayer + + +class Activation(Layer): + """ + Activation layer (specification key: `activation`). + + Args: + nonlinearity ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Nonlinearity + (required). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__(self, *, nonlinearity, name=None, input_spec=None): + super().__init__(name=name, input_spec=input_spec) + + # Nonlinearity + if nonlinearity not in ( + 'crelu', 'elu', 'leaky-relu', 'none', 'relu', 'selu', 'sigmoid', 'softmax', 'softplus', + 'softsign', 'swish', 'tanh' + ): + raise TensorforceError.value( + name='activation', argument='nonlinearity', value=nonlinearity + ) + self.nonlinearity = nonlinearity + + self.architecture_kwargs['nonlinearity'] = nonlinearity + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + @tf_function(num_args=1) + def apply(self, *, x): + if self.nonlinearity == 'crelu': + x = tf.nn.crelu(features=x) + + elif self.nonlinearity == 'elu': + x = tf.nn.elu(features=x) + + elif self.nonlinearity == 'leaky-relu': + # TODO: make alpha public argument + x = tf.nn.leaky_relu(features=x, alpha=0.2) + + elif self.nonlinearity == 'none': + pass + + elif self.nonlinearity == 'relu': + x = tf.nn.relu(features=x) + + elif self.nonlinearity == 'selu': + x = tf.nn.selu(features=x) + + elif self.nonlinearity == 'sigmoid': + x = tf.sigmoid(x=x) + + elif self.nonlinearity == 'softmax': + x = tf.nn.softmax(logits=x) + + elif self.nonlinearity == 'softplus': + x = tf.nn.softplus(features=x) + + elif self.nonlinearity == 'softsign': + x = tf.nn.softsign(features=x) + + elif self.nonlinearity == 'swish': + # https://arxiv.org/abs/1710.05941 + x = tf.sigmoid(x=x) * x + + elif self.nonlinearity == 'tanh': + x = tf.nn.tanh(x=x) + + return x + + +class Dropout(NondeterministicLayer): + """ + Dropout layer (specification key: `dropout`). + + Args: + rate (parameter, 0.0 <= float < 1.0): Dropout rate + (required). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
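The `activation` and `dropout` layers above can also be used standalone, in addition to being available as arguments of transformation layers such as `dense`. A small sketch with placeholder sizes:

```python
network = [
    dict(type='dense', size=64),
    # Standalone nonlinearity; same choices as the activation argument of dense & co.
    dict(type='activation', nonlinearity='swish'),
    # Dropout is only applied for non-deterministic (training) calls
    dict(type='dropout', rate=0.1)
]
```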
+ """ + + def __init__(self, *, rate, name=None, input_spec=None): + super().__init__(name=name, input_spec=input_spec) + + # Rate + self.rate = self.submodule( + name='rate', module=rate, modules=parameter_modules, dtype='float', min_value=0.0, + max_value=1.0 + ) + + self.architecture_kwargs['rate'] = str(rate) + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + @tf_function(num_args=2) + def apply(self, *, x, deterministic): + if self.rate.is_constant(value=0.0): + return x + + else: + rate = self.rate.value() + + def no_dropout(): + return x + + def apply_dropout(): + return tf.nn.dropout(x=x, rate=rate) + + zero = tf_util.constant(value=0.0, dtype='float') + skip_dropout = tf.math.logical_or(x=deterministic, y=tf.math.equal(x=rate, y=zero)) + return tf.cond(pred=skip_dropout, true_fn=no_dropout, false_fn=apply_dropout) + + +class Function(Layer): + """ + Custom TensorFlow function layer (specification key: `function`). + + Args: + function (callable[x -> x] | str): TensorFlow function, or string expression with argument + "x", e.g. "(x+1.0)/2.0" + (required). + output_spec (specification): Output tensor specification containing type and/or shape + information (default: same as input). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + # (requires function as first argument) + def __init__( + self, function, output_spec=None, l2_regularization=None, name=None, input_spec=None + ): + super().__init__(l2_regularization=l2_regularization, name=name, input_spec=input_spec) + + self.function = function + if output_spec is None: + self._output_spec = None + else: + self._output_spec = TensorSpec(**output_spec) + + self.architecture_kwargs['function'] = str(function) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def output_spec(self): + if self._output_spec is None: + return super().output_spec() + else: + return self._output_spec + + @tf_function(num_args=1) + def apply(self, *, x): + if isinstance(self.function, str): + x = eval(self.function, dict(), dict(x=x, math=math, np=np, random=random, tf=tf)) + else: + x = self.function(x) + + return x + + +class Reshape(Layer): + """ + Reshape layer (specification key: `reshape`). + + Args: + shape (int | iter[int]): New shape + (required). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__(self, *, shape, name=None, input_spec=None): + super().__init__(name=name, input_spec=input_spec) + + if isinstance(shape, int): + self.shape = (shape,) + else: + self.shape = tuple(shape) + + self.architecture_kwargs['reshape'] = str(self.shape) + + def output_spec(self): + output_spec = super().output_spec() + + if output_spec.size != util.product(xs=self.shape): + raise TensorforceError.value(name='Reshape', argument='shape', value=self.shape) + output_spec.shape = self.shape + + return output_spec + + @tf_function(num_args=1) + def apply(self, *, x): + x = tf.reshape(tensor=x, shape=((-1,) + self.shape)) + + return x diff --git a/tensorforce/core/layers/normalization.py b/tensorforce/core/layers/normalization.py new file mode 100644 index 000000000..273f152ec --- /dev/null +++ b/tensorforce/core/layers/normalization.py @@ -0,0 +1,330 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import parameter_modules, TensorSpec, tf_function, tf_util +from tensorforce.core.layers import Layer, StatefulLayer + + +class LinearNormalization(Layer): + """ + Linear normalization layer which scales and shifts the input to [-2.0, 2.0], for bounded states + with min/max_value (specification key: `linear_normalization`). + + Args: + min_value (float | array[float]): Lower bound of the value + (default: based on input_spec). + max_value (float | array[float]): Upper bound of the value range + (default: based on input_spec). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__(self, *, min_value=None, max_value=None, name=None, input_spec=None): + if min_value is None: + if input_spec.min_value is None: + raise TensorforceError.required(name='LinearNormalization', argument='min_value') + min_value = input_spec.min_value + + if max_value is None: + if input_spec.max_value is None: + raise TensorforceError.required(name='LinearNormalization', argument='max_value') + max_value = input_spec.max_value + + self.min_value = np.asarray(min_value) + self.max_value = np.asarray(max_value) + + if (self.min_value >= self.max_value).any(): + raise TensorforceError( + name='LinearNormalization', argument='min/max_value', + value=(self.min_value, self.max_value), hint='not less than' + ) + + super().__init__(name=name, input_spec=input_spec) + + self.architecture_kwargs['min_value'] = str(min_value) + self.architecture_kwargs['max_value'] = str(max_value) + + def default_input_spec(self): + return TensorSpec( + type='float', shape=None, min_value=self.min_value, max_value=self.max_value + ) + + def output_spec(self): + output_spec = super().output_spec() + is_inf = np.logical_or(np.isinf(self.min_value), np.isinf(self.max_value)) + if is_inf.any(): + output_spec.min_value = np.where(is_inf, self.min_value, -2.0) + output_spec.max_value = np.where(is_inf, self.max_value, 2.0) + else: + output_spec.min_value = -2.0 + output_spec.max_value = 2.0 + return output_spec + + @tf_function(num_args=1) + def apply(self, *, x): + is_inf = np.logical_or(np.isinf(self.min_value), np.isinf(self.max_value)) + is_inf = tf_util.constant(value=is_inf, dtype='bool') + min_value = tf_util.constant(value=self.min_value, dtype='float') + max_value = tf_util.constant(value=self.max_value, dtype='float') + + return tf.where( + condition=is_inf, x=x, y=(4.0 * (x - min_value) / (max_value - min_value) - 2.0) + ) + + +class ExponentialNormalization(StatefulLayer): + """ + Normalization layer based on the exponential moving average of mean and variance over the + temporal sequence of inputs + (specification key: `exponential_normalization`). + + Args: + decay (parameter, 0.0 <= float <= 1.0): Decay rate + (required). 
+ axes (iter[int >= 0]): Normalization axes, excluding batch axis + (default: all but last input axes). + only_mean (bool): Whether to normalize only with respect to mean, not variance + (default: false). + min_variance (float > 0.0): Clip variance lower than minimum + (default: 1e-4). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__( + self, *, decay, axes=None, only_mean=False, min_variance=1e-4, name=None, input_spec=None + ): + super().__init__(name=name, input_spec=input_spec) + + self.decay = self.submodule( + name='decay', module=decay, modules=parameter_modules, dtype='float', min_value=0.0, + max_value=1.0 + ) + + if axes is None: + self.axes = tuple(range(len(self.input_spec.shape) - 1)) + else: + self.axes = tuple(axes) + + assert not only_mean or min_variance == 1e-4 + self.only_mean = only_mean + self.min_variance = min_variance + + self.architecture_kwargs['decay'] = str(decay) + self.architecture_kwargs['axes'] = str(self.axes) + if only_mean: + self.architecture_kwargs['only_mean'] = str(only_mean) + self.architecture_kwargs['min_variance'] = str(min_variance) + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + def initialize(self): + super().initialize() + + shape = (1,) + tuple( + 1 if axis in self.axes else dims for axis, dims in enumerate(self.input_spec.shape) + ) + + self.moving_mean = self.variable( + name='mean', spec=TensorSpec(type='float', shape=shape), initializer='zeros', + is_trainable=False, is_saved=True + ) + + if not self.only_mean: + self.moving_variance = self.variable( + name='variance', spec=TensorSpec(type='float', shape=shape), initializer='ones', + is_trainable=False, is_saved=True + ) + + @tf_function(num_args=1) + def apply(self, *, x, independent): + if independent or self.decay.is_constant(value=1.0): + mean = self.moving_mean + if not self.only_mean: + variance = self.moving_variance + + else: + zero = tf_util.constant(value=0, dtype='int') + one_float = tf_util.constant(value=1.0, dtype='float') + axes = (0,) + tuple(1 + axis for axis in self.axes) + + batch_size = tf_util.cast(x=tf.shape(input=x)[0], dtype='int') + is_zero_batch = tf.math.equal(x=batch_size, y=zero) + + if self.only_mean: + def true_fn(): + return self.moving_mean + + def false_fn(): + return tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True) + + mean = tf.cond(pred=is_zero_batch, true_fn=true_fn, false_fn=false_fn) + + else: + def true_fn(): + return self.moving_mean, self.moving_variance + + def false_fn(): + _mean = tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True) + deviation = tf.math.squared_difference(x=x, y=_mean) + _variance = tf.reduce_mean(input_tensor=deviation, axis=axes, keepdims=True) + return _mean, _variance + + mean, variance = tf.cond(pred=is_zero_batch, true_fn=true_fn, false_fn=false_fn) + + if not self.decay.is_constant(value=0.0): + decay = self.decay.value() + batch_size = tf_util.cast(x=batch_size, dtype='float') + # Pow numerically stable since 0.0 <= decay <= 1.0 + decay = tf.math.pow(x=decay, y=batch_size) + + mean = decay * self.moving_mean + (one_float - decay) * mean + if not self.only_mean: + variance = decay * self.moving_variance + (one_float - decay) * variance + + mean = self.moving_mean.assign(value=mean) + if not self.only_mean: + variance = self.moving_variance.assign(value=variance) + + if not self.only_mean: + min_variance = tf_util.constant(value=self.min_variance, dtype='float') + reciprocal_stddev = 
tf.math.rsqrt(x=tf.maximum(x=variance, y=min_variance)) + + x = x - tf.stop_gradient(input=mean) + if not self.only_mean: + x = x * tf.stop_gradient(input=reciprocal_stddev) + + return x + + +class InstanceNormalization(Layer): + """ + Instance normalization layer (specification key: `instance_normalization`). + + Args: + axes (iter[int >= 0]): Normalization axes, excluding batch axis + (default: all input axes). + only_mean (bool): Whether to normalize only with respect to mean, not variance + (default: false). + min_variance (float > 0.0): Clip variance lower than minimum + (default: 1e-4). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__( + self, *, axes=None, only_mean=False, min_variance=1e-4, name=None, input_spec=None + ): + super().__init__(name=name, input_spec=input_spec) + + if axes is None: + self.axes = tuple(range(len(self.input_spec.shape))) + else: + self.axes = tuple(axes) + + assert not only_mean or min_variance == 1e-4 + self.only_mean = only_mean + self.min_variance = min_variance + + self.architecture_kwargs['axes'] = str(self.axes) + if only_mean: + self.architecture_kwargs['only_mean'] = str(only_mean) + self.architecture_kwargs['min_variance'] = str(min_variance) + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + @tf_function(num_args=1) + def apply(self, *, x): + axes = tuple(1 + axis for axis in self.axes) + + if self.only_mean: + mean = tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True) + + return x - tf.stop_gradient(input=mean) + + else: + mean, variance = tf.nn.moments(x=x, axes=axes, keepdims=True) + + min_variance = tf_util.constant(value=self.min_variance, dtype='float') + reciprocal_stddev = tf.math.rsqrt(x=tf.maximum(x=variance, y=min_variance)) + + return (x - tf.stop_gradient(input=mean)) * tf.stop_gradient(input=reciprocal_stddev) + + +class BatchNormalization(Layer): + """ + Batch normalization layer, generally should only be used for the agent arguments + `reward_processing[return_processing]` and `reward_processing[advantage_processing]` + (specification key: `batch_normalization`). + + Args: + axes (iter[int >= 0]): Normalization axes, excluding batch axis + (default: all but last input axes). + only_mean (bool): Whether to normalize only with respect to mean, not variance + (default: false). + min_variance (float > 0.0): Clip variance lower than minimum + (default: 1e-4). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
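As a usage sketch for the normalization layers above: `linear_normalization` rescales a bounded input from [min_value, max_value] to [-2.0, 2.0] via 4 * (x - min) / (max - min) - 2, while `exponential_normalization` standardizes using moving statistics whose effective decay per update is decay ** batch_size. The agent argument under which such a specification is passed is an assumption and varies between Tensorforce versions (e.g. `state_preprocessing` in recent releases):

```python
# Hypothetical preprocessing specification for a bounded float state
preprocessing = [
    # Uses the state's min/max_value by default
    dict(type='linear_normalization'),
    # Moving-average standardization; decay close to 1.0 means slowly changing statistics
    dict(type='exponential_normalization', decay=0.999)
]
```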
+ """ + + def __init__( + self, *, axes=None, only_mean=False, min_variance=1e-4, name=None, input_spec=None + ): + super().__init__(name=name, input_spec=input_spec) + + if axes is None: + self.axes = tuple(range(len(self.input_spec.shape) - 1)) + else: + self.axes = tuple(axes) + + assert not only_mean or min_variance == 1e-4 + self.only_mean = only_mean + self.min_variance = min_variance + + self.architecture_kwargs['axes'] = str(self.axes) + if only_mean: + self.architecture_kwargs['only_mean'] = str(only_mean) + self.architecture_kwargs['min_variance'] = str(min_variance) + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + @tf_function(num_args=1) + def apply(self, *, x): + axes = (0,) + tuple(1 + axis for axis in self.axes) + + if self.only_mean: + mean = tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True) + + return x - tf.stop_gradient(input=mean) + + else: + mean, variance = tf.nn.moments(x=x, axes=axes, keepdims=True) + + min_variance = tf_util.constant(value=self.min_variance, dtype='float') + reciprocal_stddev = tf.math.rsqrt(x=tf.maximum(x=variance, y=min_variance)) + + return (x - tf.stop_gradient(input=mean)) * tf.stop_gradient(input=reciprocal_stddev) diff --git a/tensorforce/core/layers/pooling.py b/tensorforce/core/layers/pooling.py new file mode 100644 index 000000000..5960a9dfa --- /dev/null +++ b/tensorforce/core/layers/pooling.py @@ -0,0 +1,271 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.layers import Layer + + +class Pooling(Layer): + """ + Pooling layer (global pooling) (specification key: `pooling`). + + Args: + reduction ('concat' | 'max' | 'mean' | 'product' | 'sum'): Pooling type + (required). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
+ """ + + def __init__(self, *, reduction, name=None, input_spec=None): + if reduction not in ('concat', 'max', 'mean', 'product', 'sum'): + raise TensorforceError.value(name='pooling', argument='reduction', value=reduction) + self.reduction = reduction + + super().__init__(name=name, input_spec=input_spec) + + self.architecture_kwargs['reduction'] = reduction + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + def output_spec(self): + output_spec = super().output_spec() + + if self.reduction == 'concat': + output_spec.shape = (output_spec.size,) + elif self.reduction in ('max', 'mean', 'product', 'sum'): + output_spec.shape = (output_spec.shape[-1],) + + output_spec.min_value = None + output_spec.max_value = None + + return output_spec + + @tf_function(num_args=1) + def apply(self, *, x): + if self.reduction == 'concat': + return tf.reshape(tensor=x, shape=(-1, self.output_spec().size)) + + elif self.reduction == 'max': + for _ in range(tf_util.rank(x=x) - 2): + x = tf.reduce_max(input_tensor=x, axis=1) + return x + + elif self.reduction == 'mean': + for _ in range(tf_util.rank(x=x) - 2): + x = tf.reduce_mean(input_tensor=x, axis=1) + return x + + elif self.reduction == 'product': + for _ in range(tf_util.rank(x=x) - 2): + x = tf.reduce_prod(input_tensor=x, axis=1) + return x + + elif self.reduction == 'sum': + for _ in range(tf_util.rank(x=x) - 2): + x = tf.reduce_sum(input_tensor=x, axis=1) + return x + + +class Flatten(Pooling): + """ + Flatten layer (specification key: `flatten`). + + Args: + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__(self, *, name=None, input_spec=None): + super().__init__(reduction='concat', name=name, input_spec=input_spec) + + @tf_function(num_args=1) + def apply(self, *, x): + if self.input_spec.shape == (): + return tf.expand_dims(input=x, axis=1) + + else: + return super().apply(x=x) + + +class Pool1d(Layer): + """ + 1-dimensional pooling layer (local pooling) (specification key: `pool1d`). + + Args: + reduction ('average' | 'max'): Pooling type + (required). + window (int > 0): Window size + (default: 2). + stride (int > 0): Stride size + (default: 2). + padding ('same' | 'valid'): Padding type, see + `TensorFlow docs `__ + (default: 'same'). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
+ """ + + def __init__( + self, *, reduction, window=2, stride=2, padding='same', name=None, input_spec=None + ): + if reduction not in ('average', 'max'): + raise TensorforceError.value(name='pool1d', argument='reduction', value=reduction) + self.reduction = reduction + + if isinstance(window, int): + self.window = (1, 1, window, 1) + else: + raise TensorforceError.type(name='Pool1d', argument='window', dtype=type(window)) + + if isinstance(stride, int): + self.stride = (1, 1, stride, 1) + else: + raise TensorforceError.type(name='Pool1d', argument='stride', dtype=type(stride)) + + self.padding = padding + + super().__init__(name=name, input_spec=input_spec) + + self.architecture_kwargs['reduction'] = reduction + self.architecture_kwargs['window'] = str(window) + self.architecture_kwargs['stride'] = str(stride) + self.architecture_kwargs['padding'] = padding + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0, 0)) + + def output_spec(self): + output_spec = super().output_spec() + + if self.padding == 'same': + output_spec.shape = (np.ceil(output_spec.shape[0] / self.stride[2]), output_spec.shape[1]) + elif self.padding == 'valid': + output_spec.shape = ( + np.ceil((output_spec.shape[0] - (self.window[2] - 1)) / self.stride[2]), + output_spec.shape[1] + ) + + return output_spec + + @tf_function(num_args=1) + def apply(self, *, x): + x = tf.expand_dims(input=x, axis=1) + + if self.reduction == 'average': + x = tf.nn.avg_pool( + input=x, ksize=self.window, strides=self.stride, padding=self.padding.upper() + ) + + elif self.reduction == 'max': + x = tf.nn.max_pool( + input=x, ksize=self.window, strides=self.stride, padding=self.padding.upper() + ) + + x = tf.squeeze(input=x, axis=1) + + return x + + +class Pool2d(Layer): + """ + 2-dimensional pooling layer (local pooling) (specification key: `pool2d`). + + Args: + reduction ('average' | 'max'): Pooling type + (required). + window (int > 0 | (int > 0, int > 0)): Window size + (default: 2). + stride (int > 0 | (int > 0, int > 0)): Stride size + (default: 2). + padding ('same' | 'valid'): Padding type, see + `TensorFlow docs `__ + (default: 'same'). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
+ """ + + def __init__( + self, *, reduction, window=2, stride=2, padding='same', name=None, input_spec=None + ): + if reduction not in ('average', 'max'): + raise TensorforceError.value(name='pool1d', argument='reduction', value=reduction) + self.reduction = reduction + + if isinstance(window, int): + self.window = (1, window, window, 1) + elif len(window) == 2: + self.window = (1, window[0], window[1], 1) + else: + raise TensorforceError.type(name='Pool2d', argument='window', dtype=type(window)) + + if isinstance(stride, int): + self.stride = (1, stride, stride, 1) + elif len(window) == 2: + self.stride = (1, stride[0], stride[1], 1) + else: + raise TensorforceError.type(name='Pool2d', argument='stride', dtype=type(stride)) + + self.padding = padding + + super().__init__(name=name, input_spec=input_spec) + + self.architecture_kwargs['reduction'] = reduction + self.architecture_kwargs['window'] = str(window) + self.architecture_kwargs['stride'] = str(stride) + self.architecture_kwargs['padding'] = padding + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0, 0, 0)) + + def output_spec(self): + output_spec = super().output_spec() + + if self.padding == 'same': + output_spec.shape = ( + np.ceil(output_spec.shape[0] / self.stride[1]), + np.ceil(output_spec.shape[1] / self.stride[2]), + output_spec.shape[2] + ) + elif self.padding == 'valid': + output_spec.shape = ( + np.ceil((output_spec.shape[0] - (self.window[1] - 1)) / self.stride[1]), + np.ceil((output_spec.shape[1] - (self.window[2] - 1)) / self.stride[2]), + output_spec.shape[2] + ) + + return output_spec + + @tf_function(num_args=1) + def apply(self, *, x): + if self.reduction == 'average': + x = tf.nn.avg_pool( + input=x, ksize=self.window, strides=self.stride, padding=self.padding.upper() + ) + + elif self.reduction == 'max': + x = tf.nn.max_pool( + input=x, ksize=self.window, strides=self.stride, padding=self.padding.upper() + ) + + return x diff --git a/tensorforce/core/layers/preprocessing.py b/tensorforce/core/layers/preprocessing.py new file mode 100644 index 000000000..55d74d849 --- /dev/null +++ b/tensorforce/core/layers/preprocessing.py @@ -0,0 +1,417 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import parameter_modules, SignatureDict, TensorSpec, tf_function, tf_util +from tensorforce.core.layers import Layer + + +class PreprocessingLayer(Layer): + """ + Base class for preprocessing layers which require to be reset. + + Args: + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
+ """ + + def __init__(self, *, name=None, input_spec=None): + super().__init__(name=name, input_spec=input_spec) + + def input_signature(self, *, function): + if function == 'reset': + return SignatureDict() + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'reset': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=0) + def reset(self): + raise NotImplementedError + + +class Clipping(Layer): + """ + Clipping layer (specification key: `clipping`). + + Args: + lower (parameter, float): Lower clipping value + (default: no lower bound). + upper (parameter, float): Upper clipping value + (default: no upper bound). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + """ + + def __init__(self, *, lower=None, upper=None, name=None, input_spec=None): + super().__init__(name=name, input_spec=input_spec) + + if lower is None: + assert upper is not None + self.lower = None + else: + self.lower = self.submodule( + name='lower', module=lower, modules=parameter_modules, dtype='float' + ) + + if upper is None: + assert lower is not None + self.upper = None + else: + self.upper = self.submodule( + name='upper', module=upper, modules=parameter_modules, dtype='float' + ) + + if lower is not None: + self.architecture_kwargs['lower'] = str(lower) + if upper is not None: + self.architecture_kwargs['upper'] = str(upper) + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + @tf_function(num_args=1) + def apply(self, *, x): + if self.lower is None: + upper = self.upper.value() + return tf.math.minimum(x=x, y=upper) + elif self.upper is None: + lower = self.lower.value() + return tf.math.maximum(x=x, y=lower) + else: + lower = self.lower.value() + upper = self.upper.value() + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal( + x=upper, y=lower, message="Incompatible lower and upper clipping bound." + )) + with tf.control_dependencies(control_inputs=assertions): + return tf.clip_by_value(t=x, clip_value_min=lower, clip_value_max=upper) + + +class Deltafier(PreprocessingLayer): + """ + Deltafier layer computing the difference between the current and the previous input; can only + be used as preprocessing layer (specification key: `deltafier`). + + Args: + concatenate (False | int >= 0): Whether to concatenate instead of replace deltas with + input, and if so, concatenation axis + (default: false). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
+ """ + + def __init__(self, *, concatenate=False, name=None, input_spec=None): + self.concatenate = concatenate + + super().__init__(name=name, input_spec=input_spec) + + self.architecture_kwargs['concatenate'] = str(concatenate) + + def default_input_spec(self): + return TensorSpec(type='float', shape=None) + + def output_spec(self): + output_spec = super().output_spec() + + if self.concatenate is not False: + output_spec.shape = tuple( + 2 * dims if axis == self.concatenate else dims + for axis, dims in enumerate(output_spec.shape) + ) + + return output_spec + + def initialize(self): + super().initialize() + + self.has_previous = self.variable( + name='has-previous', spec=TensorSpec(type='bool', shape=()), initializer='zeros', + is_trainable=False, is_saved=False + ) + + self.previous = self.variable( + name='previous', spec=TensorSpec(type='float', shape=((1,) + self.input_spec.shape)), + initializer='zeros', is_trainable=False, is_saved=False + ) + + @tf_function(num_args=0) + def reset(self): + false = tf_util.constant(value=False, dtype='bool') + assignment = self.has_previous.assign(value=false, read_value=False) + with tf.control_dependencies(control_inputs=(assignment,)): + return tf_util.identity(input=false) + + @tf_function(num_args=1) + def apply(self, *, x): + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_less_equal( + x=tf.shape(input=x)[0], y=1, + message="Deltafier preprocessor currently not compatible with batched Agent.act." + )) + + # TODO: hack for empty batch (for self.previous.assign below) + extended = tf.concat(values=(self.previous, x), axis=0) + + def first_delta(): + assignment = self.has_previous.assign( + value=tf_util.constant(value=True, dtype='bool'), read_value=False + ) + with tf.control_dependencies(control_inputs=(assignment,)): + return tf.concat(values=(tf.zeros_like(input=x[:1]), x[1:] - x[:-1]), axis=0) + + def later_delta(): + return x - extended[:-1] + + with tf.control_dependencies(control_inputs=assertions): + empty_batch = tf.math.equal(x=tf.shape(input=x)[0], y=0) + pred = tf.math.logical_or(x=self.has_previous, y=empty_batch) + delta = tf.cond(pred=pred, true_fn=later_delta, false_fn=first_delta) + + assignment = self.previous.assign(value=extended[-1:], read_value=False) + + with tf.control_dependencies(control_inputs=(assignment,)): + if self.concatenate is False: + return tf_util.identity(input=delta) + else: + return tf.concat(values=(x, delta), axis=(self.concatenate + 1)) + + +class Image(Layer): + """ + Image preprocessing layer (specification key: `image`). + + Args: + height (int): Height of resized image + (default: no resizing or relative to width). + width (int): Width of resized image + (default: no resizing or relative to height). + grayscale (bool | iter[float]): Turn into grayscale image, optionally using given weights + (default: false). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
+ """ + + def __init__(self, *, height=None, width=None, grayscale=False, name=None, input_spec=None): + self.height = height + self.width = width + self.grayscale = grayscale + + super().__init__(name=name, input_spec=input_spec) + + if height is not None: + self.architecture_kwargs['height'] = str(height) + if width is not None: + self.architecture_kwargs['width'] = str(width) + if grayscale: + self.architecture_kwargs['grayscale'] = str(grayscale) + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0, 0, 0)) + + def output_spec(self): + output_spec = super().output_spec() + + if self.height is not None: + if self.width is None: + self.width = round(self.height * output_spec.shape[1] / output_spec.shape[0]) + output_spec.shape = (self.height, self.width, output_spec.shape[2]) + elif self.width is not None: + self.height = round(self.width * output_spec.shape[0] / output_spec.shape[1]) + output_spec.shape = (self.height, self.width, output_spec.shape[2]) + + if not isinstance(self.grayscale, bool) or self.grayscale: + output_spec.shape = output_spec.shape[:2] + (1,) + + return output_spec + + @tf_function(num_args=1) + def apply(self, *, x): + if self.height is not None: + x = tf.image.resize(images=x, size=(self.height, self.width)) + + if not isinstance(self.grayscale, bool): + weights = tf_util.constant( + value=self.grayscale, dtype='float', shape=(1, 1, 1, len(self.grayscale)) + ) + x = tf.reduce_sum(input_tensor=(x * weights), axis=3, keepdims=True) + elif self.grayscale: + x = tf.image.rgb_to_grayscale(images=x) + + return x + + +class Sequence(PreprocessingLayer): + """ + Sequence layer stacking the current and previous inputs; can only be used as preprocessing + layer (specification key: `sequence`). + + Args: + length (int > 0): Number of inputs to concatenate + (required). + axis (int >= 0): Concatenation axis, excluding batch axis + (default: last axis). + concatenate (bool): Whether to concatenate inputs at given axis, otherwise introduce new + sequence axis + (default: true). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. 
+ """ + + def __init__(self, *, length, axis=-1, concatenate=True, name=None, input_spec=None): + assert length > 1 + self.length = length + self.axis = axis + self.concatenate = concatenate + + super().__init__(name=name, input_spec=input_spec) + + self.architecture_kwargs['length'] = str(length) + self.architecture_kwargs['axis'] = str(axis) + self.architecture_kwargs['concatenate'] = str(concatenate) + + def output_spec(self): + output_spec = super().output_spec() + + if self.concatenate: + if self.axis == -1: + self.axis = len(output_spec.shape) - 1 + output_spec.shape = tuple( + self.length * dims if axis == self.axis else dims + for axis, dims in enumerate(output_spec.shape) + ) + + else: + if self.axis == -1: + self.axis = len(output_spec.shape) + shape = output_spec.shape + output_spec.shape = shape[:self.axis] + (self.length,) + shape[self.axis:] + + return output_spec + + def initialize(self): + super().initialize() + + self.has_previous = self.variable( + name='has-previous', spec=TensorSpec(type='bool', shape=()), initializer='zeros', + is_trainable=False, is_saved=False + ) + + shape = self.input_spec.shape + if self.concatenate: + shape = (1,) + shape[:self.axis] + (shape[self.axis] * (self.length - 1),) + \ + shape[self.axis + 1:] + else: + shape = (1,) + shape[:self.axis] + (self.length - 1,) + shape[self.axis:] + self.previous = self.variable( + name='previous', spec=TensorSpec(type='float', shape=shape), initializer='zeros', + is_trainable=False, is_saved=False + ) + + @tf_function(num_args=0) + def reset(self): + false = tf_util.constant(value=False, dtype='bool') + assignment = self.has_previous.assign(value=false, read_value=False) + with tf.control_dependencies(control_inputs=(assignment,)): + return tf_util.identity(input=false) + + @tf_function(num_args=1) + def apply(self, *, x): + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_less_equal( + x=tf.shape(input=x)[0], y=1, + message="Sequence preprocessor currently not compatible with batched Agent.act." 
+ )) + + with tf.control_dependencies(control_inputs=assertions): + + def empty_batch(): + if self.concatenate: + current = x + else: + current = tf.expand_dims(input=x, axis=(self.axis + 1)) + multiples = tuple( + self.length if dims == self.axis + 1 else 1 + for dims in range(self.output_spec().rank + 1) + ) + return tf.tile(input=current, multiples=multiples) + + def not_empty_batch(): + + def first_timestep(): + assignment = self.has_previous.assign( + value=tf_util.constant(value=True, dtype='bool'), read_value=False + ) + with tf.control_dependencies(control_inputs=(assignment,)): + if self.concatenate: + current = x + else: + current = tf.expand_dims(input=x, axis=(self.axis + 1)) + multiples = tuple( + self.length if dims == self.axis + 1 else 1 + for dims in range(self.output_spec().rank + 1) + ) + return tf.tile(input=current, multiples=multiples) + + def other_timesteps(): + if self.concatenate: + current = x + else: + current = tf.expand_dims(input=x, axis=(self.axis + 1)) + return tf.concat(values=(self.previous, current), axis=(self.axis + 1)) + + xs = tf.cond( + pred=self.has_previous, true_fn=other_timesteps, false_fn=first_timestep + ) + + if self.concatenate: + begin = tuple( + self.input_spec.shape[dims - 1] if dims == self.axis + 1 else 0 + for dims in range(self.output_spec().rank + 1) + ) + else: + begin = tuple( + 1 if dims == self.axis + 1 else 0 + for dims in range(self.output_spec().rank + 1) + ) + assignment = self.previous.assign( + value=tf.slice(input_=xs, begin=begin, size=self.previous.shape), + read_value=False + ) + + with tf.control_dependencies(control_inputs=(assignment,)): + return tf_util.identity(input=xs) + + is_empty_batch = tf.math.equal(x=tf.shape(input=x)[0], y=0) + return tf.cond(pred=is_empty_batch, true_fn=empty_batch, false_fn=not_empty_batch) diff --git a/tensorforce/core/layers/rnn.py b/tensorforce/core/layers/rnn.py new file mode 100644 index 000000000..5aa410075 --- /dev/null +++ b/tensorforce/core/layers/rnn.py @@ -0,0 +1,260 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np +import tensorflow as tf + +from tensorforce import TensorforceError, util +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.layers import TemporalLayer, TransformationBase + + +class Rnn(TemporalLayer, TransformationBase): + """ + Recurrent neural network layer which is unrolled over the sequence of timesteps (per episode), + that is, the RNN cell is applied to the layer input at each timestep and the RNN consequently + maintains a temporal internal state over the course of an episode (specification key: `rnn`). + + Args: + cell ('gru' | 'lstm'): The recurrent cell type + (required). + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). 
+ horizon (parameter, int >= 0): Past horizon, for truncated backpropagation through time + (required). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: tanh). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Additional arguments for Keras RNN cell layer, see + `TensorFlow docs `__. + """ + + def __init__( + self, *, cell, size, horizon, bias=True, activation='tanh', dropout=0.0, + vars_trainable=True, l2_regularization=None, name=None, input_spec=None, **kwargs + ): + if bias: + # Hack for TransformationBase to avoid name clash with Keras variable name + bias = '_bias' + + super().__init__( + temporal_processing='iterative', horizon=horizon, size=size, bias=bias, + activation=activation, dropout=dropout, vars_trainable=vars_trainable, + l2_regularization=l2_regularization, name=name, input_spec=input_spec + ) + + self.cell_type = cell + if self.cell_type == 'gru': + self.cell = tf.keras.layers.GRUCell( + units=self.size, name='cell', **kwargs # , dtype=tf_util.get_dtype(type='float') + ) + elif self.cell_type == 'lstm': + self.cell = tf.keras.layers.LSTMCell( + units=self.size, name='cell', **kwargs # , dtype=tf_util.get_dtype(type='float') + ) + else: + raise TensorforceError.value( + name='Rnn', argument='cell', value=self.cell_type, hint='not in {gru,lstm}' + ) + + self.architecture_kwargs['cell'] = cell + self.architecture_kwargs['size'] = str(size) + self.architecture_kwargs['horizon'] = str(horizon) + self.architecture_kwargs['bias'] = str(bias) + if activation is not None: + self.architecture_kwargs['activation'] = str(activation) + if dropout != 0.0: + self.architecture_kwargs['dropout'] = str(dropout) + if not vars_trainable: + self.architecture_kwargs['trainable'] = str(vars_trainable) + if l2_regularization is not None: + self.architecture_kwargs['l2_regularization'] = str(l2_regularization) + + def default_input_spec(self): + return TensorSpec(type='float', shape=(0,)) + + def output_spec(self): + output_spec = super().output_spec() + + if self.squeeze: + output_spec.shape = output_spec.shape[:-1] + else: + output_spec.shape = output_spec.shape[:-1] + (self.size,) + + output_spec.min_value = None + output_spec.max_value = None + + return output_spec + + @property + def internals_spec(self): + internals_spec = super().internals_spec + + if self.cell_type == 'gru': + shape = (self.size,) + elif self.cell_type == 'lstm': + shape = (2, self.size) + + internals_spec['state'] = TensorSpec(type='float', shape=shape) + + return internals_spec + + def internals_init(self): + internals_init = super().internals_init() + + if self.cell_type == 'gru': + shape = (self.size,) + elif self.cell_type == 'lstm': + shape = (2, self.size) + + stddev = min(0.1, np.sqrt(2.0 / self.size)) + internals_init['state'] = np.random.normal(scale=stddev, size=shape).astype( + util.np_dtype(dtype='float') + ) + + return internals_init + + def initialize(self): + super().initialize() + + self.cell.build(input_shape=self.input_spec.shape[0]) + + @tf_function(num_args=0) + def 
regularize(self): + regularization_loss = super().regularize() + + if len(self.cell.losses) > 0: + regularization_loss += tf.math.add_n(inputs=self.cell.losses) + + return regularization_loss + + @tf_function(num_args=3) + def apply(self, *, x, horizons, internals): + x, internals = TemporalLayer.apply(self=self, x=x, horizons=horizons, internals=internals) + x = TransformationBase.apply(self=self, x=x) + return x, internals + + @tf_function(num_args=2) + def iterative_apply(self, *, x, internals): + x = tf_util.float32(x=x) + state = tf_util.float32(x=internals['state']) + + if self.cell_type == 'gru': + state = (state,) + elif self.cell_type == 'lstm': + state = (state[:, 0, :], state[:, 1, :]) + + x, state = self.cell(inputs=x, states=state) + + if self.cell_type == 'gru': + state = state[0] + elif self.cell_type == 'lstm': + state = tf.stack(values=state, axis=1) + + x = tf_util.cast(x=x, dtype='float') + internals['state'] = tf_util.cast(x=state, dtype='float') + + return x, internals + + +class Gru(Rnn): + """ + Gated recurrent unit layer which is unrolled over the sequence of timesteps (per episode), that + is, the GRU cell is applied to the layer input at each timestep and the GRU consequently + maintains a temporal internal state over the course of an episode (specification key: `gru`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + horizon (parameter, int >= 0): Past horizon, for truncated backpropagation through time + (required). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: tanh). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). + vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Additional arguments for Keras GRU layer, see + `TensorFlow docs `__. + """ + + def __init__( + self, *, size, horizon, bias=True, activation='tanh', dropout=0.0, vars_trainable=True, + l2_regularization=None, name=None, input_spec=None, **kwargs + ): + super().__init__( + cell='gru', size=size, horizon=horizon, bias=bias, activation=activation, + dropout=dropout, vars_trainable=vars_trainable, l2_regularization=l2_regularization, + name=name, input_spec=input_spec, **kwargs + ) + + +class Lstm(Rnn): + """ + Long short-term memory layer which is unrolled over the sequence of timesteps (per episode), + that is, the LSTM cell is applied to the layer input at each timestep and the LSTM consequently + maintains a temporal internal state over the course of an episode (specification key: `lstm`). + + Args: + size (int >= 0): Layer output size, 0 implies additionally removing the axis + (required). + horizon (parameter, int >= 0): Past horizon, for truncated backpropagation through time + (required). + bias (bool): Whether to add a trainable bias variable + (default: true). + activation ('crelu' | 'elu' | 'leaky-relu' | 'none' | 'relu' | 'selu' | 'sigmoid' | + 'softmax' | 'softplus' | 'softsign' | 'swish' | 'tanh'): Activation nonlinearity + (default: tanh). + dropout (parameter, 0.0 <= float < 1.0): Dropout rate + (default: 0.0). 
+ vars_trainable (bool): Whether layer variables are trainable + (default: true). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): Layer name + (default: internally chosen). + input_spec (specification): internal use. + kwargs: Additional arguments for Keras LSTM layer, see + `TensorFlow docs `__. + """ + + def __init__( + self, *, size, horizon, bias=True, activation='tanh', dropout=0.0, vars_trainable=True, + l2_regularization=None, name=None, input_spec=None, **kwargs + ): + super().__init__( + cell='lstm', size=size, horizon=horizon, bias=bias, activation=activation, + dropout=dropout, vars_trainable=vars_trainable, l2_regularization=l2_regularization, + name=name, input_spec=input_spec, **kwargs + ) diff --git a/tensorforce/core/memories/__init__.py b/tensorforce/core/memories/__init__.py index 5c1ba9c8e..7689305df 100755 --- a/tensorforce/core/memories/__init__.py +++ b/tensorforce/core/memories/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,23 +15,13 @@ from tensorforce.core.memories.memory import Memory from tensorforce.core.memories.queue import Queue -from tensorforce.core.memories.latest import Latest + +# from tensorforce.core.memories.prioritized_replay import PrioritizedReplay +from tensorforce.core.memories.recent import Recent from tensorforce.core.memories.replay import Replay -from tensorforce.core.memories.prioritized_replay import PrioritizedReplay -memories = dict( - latest=Latest, - replay=Replay, - prioritized_replay=PrioritizedReplay -) +memory_modules = dict(default=Replay, recent=Recent, replay=Replay) -__all__ = [ - 'memories', - 'Memory', - 'Queue', - 'Latest', - 'Replay', - 'PrioritizedReplay' -] +__all__ = ['Memory', 'memory_modules', 'Queue', 'Recent', 'Replay'] diff --git a/tensorforce/core/memories/latest.py b/tensorforce/core/memories/latest.py deleted file mode 100755 index 318364ced..000000000 --- a/tensorforce/core/memories/latest.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce.core.memories import Queue - - -class Latest(Queue): - """ - Memory which always retrieves most recent experiences. - """ - - def __init__(self, states, internals, actions, include_next_states, capacity, scope='latest', summary_labels=None): - """ - Latest memory. - - Args: - states: States specifiction. - internals: Internal states specification. - actions: Actions specification. - include_next_states: Include subsequent state if true. 
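To relate the three recurrent specification keys defined above (`rnn`, `gru`, `lstm`), here is a hedged sketch of equivalent layer specifications; the size and horizon values are arbitrary examples.

```python
# Illustrative recurrent layer specifications (size/horizon chosen arbitrarily).
lstm_layer = dict(type='lstm', size=64, horizon=10)
gru_layer = dict(type='gru', size=64, horizon=10)
# the generic form with an explicit cell type is equivalent to the shorthands above
rnn_layer = dict(type='rnn', cell='lstm', size=64, horizon=10)
```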
- capacity: Memory capacity. - """ - super(Latest, self).__init__( - states=states, - internals=internals, - actions=actions, - include_next_states=include_next_states, - capacity=capacity, - scope=scope, - summary_labels=summary_labels - ) - - def tf_retrieve_timesteps(self, n): - num_timesteps = (self.memory_index - self.episode_indices[0] - 2) % self.capacity + 1 - n = tf.minimum(x=n, y=num_timesteps) - indices = tf.range( - start=(self.memory_index - 1 - n), - limit=(self.memory_index - 1) - ) % self.capacity - terminal = tf.gather(params=self.terminal_memory, indices=indices) - indices = tf.boolean_mask(tensor=indices, mask=tf.logical_not(x=terminal)) - return self.retrieve_indices(indices=indices) - - def tf_retrieve_episodes(self, n): - n = tf.minimum(x=n, y=self.episode_count) - start = self.episode_indices[self.episode_count - n - 1] + 1 - limit = self.episode_indices[self.episode_count - 1] - limit += tf.where(condition=(start < limit), x=0, y=self.capacity) - indices = tf.range(start=start, limit=limit) % self.capacity - return self.retrieve_indices(indices=indices) - - def tf_retrieve_sequences(self, n, sequence_length): - num_sequences = (self.memory_index - self.episode_indices[0] - 2 - sequence_length + 1) % self.capacity + 1 - n = tf.minimum(x=n, y=num_sequences) - indices = tf.range( - start=(self.memory_index - 1 - n - sequence_length), # or '- 1' implied in sequence length? - limit=(self.memory_index - 1) - ) % self.capacity - # sequence_indices = [tf.range(start=indices[n], limit=(indices[n] + sequence_length)) for k in range(n)] - # sequence_indices = [indices[k: k + sequence_length] for k in tf.unstack(value=tf.range(start=0, limit=n), num=n)] - sequence_indices = tf.expand_dims(input=tf.range(start=0, limit=n), axis=1) + tf.expand_dims(input=tf.constant(value=list(range(sequence_length))), axis=0) - sequence_indices = tf.reshape(tensor=sequence_indices, shape=(n * sequence_length,)) - # sequence_indices = tf.concat(values=sequence_indices, axis=0) # tf.stack !!!!! - terminal = tf.gather(params=self.terminal_memory, indices=indices) - sequence_indices = tf.boolean_mask(tensor=sequence_indices, mask=tf.logical_not(x=terminal)) - return self.retrieve_indices(indices=sequence_indices) diff --git a/tensorforce/core/memories/memory.py b/tensorforce/core/memories/memory.py index 4aeedccf5..4d6fe4a29 100755 --- a/tensorforce/core/memories/memory.py +++ b/tensorforce/core/memories/memory.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,171 +13,161 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division +from tensorforce.core import Module, SignatureDict, TensorSpec, tf_function -import tensorflow as tf -from tensorforce import util -import tensorforce.core.memories - - -class Memory(object): +class Memory(Module): """ Base class for memories. + + Args: + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + values_spec (specification): internal use. + min_capacity (int >= 0): internal use. 
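Given the `memory_modules` mapping introduced above (default=Replay, recent=Recent, replay=Replay), a memory specification presumably resolves by key; the exact agent-level argument shape shown here is an assumption for illustration.

```python
# Illustrative memory specifications resolved via memory_modules.
memory_recent = 'recent'             # -> Recent
memory_replay = dict(type='replay')  # -> Replay; the 'default' entry also maps to Replay
```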
""" - def __init__(self, states, internals, actions, include_next_states, scope='memory', summary_labels=None): - """ - Memory. - - Args: - states: States specifiction. - internals: Internal states specification. - actions: Actions specification. - include_next_states: Include subsequent state if true. - """ - self.states_spec = states - self.internals_spec = internals - self.actions_spec = actions - self.include_next_states = include_next_states - self.summary_labels = set(summary_labels or ()) - - self.variables = dict() - self.summaries = list() - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - assert not kwargs.get('trainable', False) - self.variables[name] = variable - return variable - - self.initialize = tf.make_template( - name_=(scope + '/initialize'), - func_=self.tf_initialize, - custom_getter_=custom_getter - ) - self.store = tf.make_template( - name_=(scope + '/store'), - func_=self.tf_store, - custom_getter_=custom_getter - ) - self.retrieve_timesteps = tf.make_template( - name_=(scope + '/retrieve_timesteps'), - func_=self.tf_retrieve_timesteps, - custom_getter_=custom_getter - ) - self.retrieve_episodes = tf.make_template( - name_=(scope + '/retrieve_episodes'), - func_=self.tf_retrieve_episodes, - custom_getter_=custom_getter - ) - self.retrieve_sequences = tf.make_template( - name_=(scope + '/retrieve_sequences'), - func_=self.tf_retrieve_sequences, - custom_getter_=custom_getter - ) - self.update_batch = tf.make_template( - name_=(scope + '/update_batch'), - func_=self.tf_update_batch, - custom_getter_=custom_getter - ) - - def tf_initialize(self): - """ - Initializes memory. - """ + def __init__( + self, *, device=None, l2_regularization=None, name=None, values_spec=None, min_capacity=None + ): + super().__init__(device=device, l2_regularization=l2_regularization, name=name) + + self.values_spec = values_spec + self.min_capacity = min_capacity + + def input_signature(self, *, function): + if function == 'enqueue': + return self.values_spec.signature(batched=True) + + elif function == 'predecessors': + return SignatureDict( + indices=TensorSpec(type='int', shape=()).signature(batched=True), + horizon=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + elif function == 'reset': + return SignatureDict() + + elif function == 'retrieve': + return SignatureDict(indices=TensorSpec(type='int', shape=()).signature(batched=True)) + + elif function == 'retrieve_episodes': + return SignatureDict(n=TensorSpec(type='int', shape=()).signature(batched=False)) + + elif function == 'retrieve_timesteps': + return SignatureDict( + n=TensorSpec(type='int', shape=()).signature(batched=False), + past_horizon=TensorSpec(type='int', shape=()).signature(batched=False), + future_horizon=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + elif function == 'successors': + return SignatureDict( + indices=TensorSpec(type='int', shape=()).signature(batched=True), + horizon=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'enqueue': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'predecessors': + def get_output_signature(sequence_values, initial_values): + if len(sequence_values) == 0: + if len(initial_values) == 0: + return SignatureDict( + singleton=TensorSpec(type='int', 
shape=()).signature(batched=True) + ) + else: + return SignatureDict( + lengths=TensorSpec(type='int', shape=()).signature(batched=True), + initial_values=self.values_spec[initial_values].signature(batched=True) + ) + elif len(initial_values) == 0: + return SignatureDict( + starts_lengths=TensorSpec(type='int', shape=(2,)).signature(batched=True), + sequence_values=self.values_spec[sequence_values].signature(batched=True) + ) + else: + return SignatureDict( + starts_lengths=TensorSpec(type='int', shape=(2,)).signature(batched=True), + sequence_values=self.values_spec[sequence_values].signature(batched=True), + initial_values=self.values_spec[initial_values].signature(batched=True) + ) + return get_output_signature + + elif function == 'reset': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'retrieve': + def get_output_signature(values): + return SignatureDict(singleton=self.values_spec[values].signature(batched=True)) + return get_output_signature + + elif function == 'retrieve_episodes': + return SignatureDict(singleton=TensorSpec(type='int', shape=()).signature(batched=True)) + + elif function == 'retrieve_timesteps': + return SignatureDict(singleton=TensorSpec(type='int', shape=()).signature(batched=True)) + + elif function == 'successors': + def get_output_signature(sequence_values, final_values): + if len(sequence_values) == 0: + if len(final_values) == 0: + return SignatureDict( + singleton=TensorSpec(type='int', shape=()).signature(batched=True) + ) + else: + return SignatureDict( + lengths=TensorSpec(type='int', shape=()).signature(batched=True), + final_values=self.values_spec[final_values].signature(batched=True) + ) + elif len(final_values) == 0: + return SignatureDict( + starts_lengths=TensorSpec(type='int', shape=(2,)).signature(batched=True), + sequence_values=self.values_spec[sequence_values].signature(batched=True) + ) + else: + return SignatureDict( + starts_lengths=TensorSpec(type='int', shape=(2,)).signature(batched=True), + sequence_values=self.values_spec[sequence_values].signature(batched=True), + final_values=self.values_spec[final_values].signature(batched=True) + ) + return get_output_signature + + else: + return super().output_signature(function=function) + + @tf_function(num_args=6) + def enqueue(self, *, states, internals, auxiliaries, actions, terminal, reward): raise NotImplementedError - def tf_store(self, states, internals, actions, terminal, reward): - """" - Stores experiences, i.e. a batch of timesteps. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - actions: Dict of action tensors. - terminal: Terminal boolean tensor. - reward: Reward tensor. - """ + @tf_function(num_args=1) + def retrieve(self, *, indices, values): raise NotImplementedError - def tf_retrieve_timesteps(self, n): - """ - Retrieves a given number of timesteps from the stored experiences. - - Args: - n: Number of timesteps to retrieve. - - Returns: - Dicts containing the retrieved experiences. - """ + @tf_function(num_args=2) + def successors(self, *, indices, horizon, sequence_values, final_values): raise NotImplementedError - def tf_retrieve_episodes(self, n): - """ - Retrieves a given number of episodes from the stored experiences. - - Args: - n: Number of episodes to retrieve. - - Returns: - Dicts containing the retrieved experiences. 
- """ + @tf_function(num_args=2) + def predecessors(self, *, indices, horizon, sequence_values, initial_values): raise NotImplementedError - def tf_retrieve_sequences(self, n, sequence_length): - """ - Retrieves a given number of temporally consistent timestep sequences from the stored - experiences. - - Args: - n: Number of sequences to retrieve. - sequence_length: Length of timestep sequences. - - Returns: - Dicts containing the retrieved experiences. - """ + @tf_function(num_args=3) + def retrieve_timesteps(self, *, n, past_horizon, future_horizon): raise NotImplementedError - def tf_update_batch(self, loss_per_instance): - """ - Updates the internal information of the latest batch instances based on their loss. - - Args: - loss_per_instance: Loss per instance tensor. - """ - pass - - def get_variables(self): - """ - Returns the TensorFlow variables used by the memory. - - Returns: - List of variables. - """ - return [self.variables[key] for key in sorted(self.variables)] - - def get_summaries(self): - """ - Returns the TensorFlow summaries reported by the memory. - - Returns: - List of summaries. - """ - return self.summaries - - @staticmethod - def from_spec(spec, kwargs=None): - """ - Creates a memory from a specification dict. - """ - memory = util.get_object( - obj=spec, - predefined_objects=tensorforce.core.memories.memories, - kwargs=kwargs - ) - assert isinstance(memory, Memory) - return memory + @tf_function(num_args=1) + def retrieve_episodes(self, *, n): + raise NotImplementedError diff --git a/tensorforce/core/memories/old_naive_prioritized_replay.py b/tensorforce/core/memories/old_naive_prioritized_replay.py deleted file mode 100755 index 002318d2e..000000000 --- a/tensorforce/core/memories/old_naive_prioritized_replay.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from random import random, randrange -from six.moves import xrange -import numpy as np - -from tensorforce import util, TensorForceError -from tensorforce.core.memories import Memory - - -#TODO implement in TF -class NaivePrioritizedReplay(Memory): - """ - Prioritised replay sampling based on loss per experience. - """ - - def __init__(self, states_spec, actions_spec, capacity, prioritization_weight=1.0): - super(NaivePrioritizedReplay, self).__init__(states_spec=states_spec, actions_spec=actions_spec) - self.capacity = capacity - self.prioritization_weight = prioritization_weight - self.internals_config = None - # Stores (priority, observation) pairs in reverse priority order. - self.observations = list() - self.none_priority_index = 0 - self.batch_indices = None - # Stores last observation until next_state value is known. 
- self.last_observation = None - - def add_observation(self, states, internals, actions, terminal, reward): - if self.internals_config is None and internals is not None: - self.internals_config = [(internal.shape, internal.dtype) for internal in internals] - - if self.last_observation is not None: - observation = self.last_observation + (states, internals) - - if len(self.observations) < self.capacity: - self.observations.append((None, observation)) - elif self.none_priority_index > 0: - priority, _ = self.observations.pop(self.none_priority_index - 1) - self.observations.append((None, observation)) - self.none_priority_index -= 1 - else: - raise TensorForceError("Memory contains only unseen observations.") - - self.last_observation = (states, internals, actions, terminal, reward) - - def get_batch(self, batch_size, next_states=False): - """ - Samples a batch of the specified size according to priority. - - Args: - batch_size: The batch size - next_states: A boolean flag indicating whether 'next_states' values should be included - - Returns: A dict containing states, actions, rewards, terminals, internal states (and next states) - - """ - if batch_size > len(self.observations): - raise TensorForceError("Batch size is larger than number of observations in memory.") - - states = {name: np.zeros((batch_size,) + tuple(state['shape']), dtype=util.np_dtype(state['type'])) for name, state in self.states_spec.items()} - internals = [np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internals_config] - actions = {name: np.zeros((batch_size,) + tuple(action['shape']), dtype=util.np_dtype(action['type'])) for name, action in self.actions_spec.items()} - terminal = np.zeros((batch_size,), dtype=util.np_dtype('bool')) - reward = np.zeros((batch_size,), dtype=util.np_dtype('float')) - if next_states: - next_states = {name: np.zeros((batch_size,) + tuple(state['shape']), dtype=util.np_dtype(state['type'])) for name, state in self.states_spec.items()} - next_internals = [np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internals_config] - - self.batch_indices = list() - not_sampled_index = self.none_priority_index - sum_priorities = sum(priority for priority, _ in self.observations if priority is not None) - for n in xrange(batch_size): - if not_sampled_index < len(self.observations): - _, observation = self.observations[not_sampled_index] - index = not_sampled_index - not_sampled_index += 1 - elif sum_priorities / self.capacity < util.epsilon: - index = randrange(self.none_priority_index) - while index in self.batch_indices: - index = randrange(self.none_priority_index) - _, observation = self.observations[index] - else: - while True: - sample = random() - for index, (priority, observation) in enumerate(self.observations): - sample -= priority / sum_priorities - if sample < 0.0 or index >= self.none_priority_index: - break - if index not in self.batch_indices: - break - - for name, state in states.items(): - state[n] = observation[0][name] - for k, internal in enumerate(internals): - internal[n] = observation[1][k] - for name, action in actions.items(): - action[n] = observation[2][name] - terminal[n] = observation[3] - reward[n] = observation[4] - if next_states: - for name, next_state in next_states.items(): - next_state[n] = observation[5][name] - for k, next_internal in enumerate(next_internals): - next_internal[n] = observation[6][k] - self.batch_indices.append(index) - - if next_states: - return dict(states=states, internals=internals, actions=actions, 
terminal=terminal, reward=reward, next_states=next_states, next_internals=next_internals) - else: - return dict(states=states, internals=internals, actions=actions, terminal=terminal, reward=reward) - - def update_batch(self, loss_per_instance): - """ - Computes priorities according to loss. - - Args: - loss_per_instance: - - """ - if self.batch_indices is None: - raise TensorForceError("Need to call get_batch before each update_batch call.") - # if len(loss_per_instance) != len(self.batch_indices): - # raise TensorForceError("For all instances a loss value has to be provided.") - - updated = list() - for index, loss in zip(self.batch_indices, loss_per_instance): - priority, observation = self.observations[index] - updated.append((loss ** self.prioritization_weight, observation)) - for index in sorted(self.batch_indices, reverse=True): - priority, _ = self.observations.pop(index) - self.none_priority_index -= (priority is not None) - self.batch_indices = None - updated = sorted(updated, key=(lambda x: x[0])) - - update_priority, update_observation = updated.pop() - index = -1 - for priority, _ in iter(self.observations): - index += 1 - if index == self.none_priority_index: - break - if update_priority < priority: - continue - self.observations.insert(index, (update_priority, update_observation)) - index += 1 - self.none_priority_index += 1 - if not updated: - break - update_priority, update_observation = updated.pop() - else: - self.observations.insert(index, (update_priority, update_observation)) - self.none_priority_index += 1 - while updated: - self.observations.insert(index, updated.pop()) - self.none_priority_index += 1 diff --git a/tensorforce/core/memories/old_prioritized_replay.py b/tensorforce/core/memories/old_prioritized_replay.py deleted file mode 100644 index d310b1b29..000000000 --- a/tensorforce/core/memories/old_prioritized_replay.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import random -from six.moves import xrange -import numpy as np -from collections import namedtuple - -from tensorforce import util, TensorForceError -from tensorforce.core.memories import Memory - -_SumRow = namedtuple('SumRow', ['item', 'priority']) - -#TODO move to util/rename methods to conform to naming scheme -class SumTree(object): - """ - Sum tree data structure where data is stored in leaves and each node on the - tree contains a sum of the children. - - Items and priorities are stored in leaf nodes, while internal nodes store - the sum of priorities from all its descendants. Internally a single list - stores the internal nodes followed by leaf nodes. 
- - See: - - [Binary heap trees](https://en.wikipedia.org/wiki/Binary_heap) - - [Section B.2.1 in the prioritized replay paper](https://arxiv.org/pdf/1511.05952.pdf) - - [The CNTK implementation](https://github.com/Microsoft/CNTK/blob/258fbec7600fe525b50c3e12d4df0c971a42b96a/bindings/python/cntk/contrib/deeprl/agent/shared/replay_memory.py) - - Usage: - tree = SumTree(100) - tree.push('item1', priority=0.5) - tree.push('item2', priority=0.6) - item, priority = tree[0] - batch = tree.sample_minibatch(2) - """ - - def __init__(self, capacity): - self._capacity = capacity - - # Initializes all internal nodes to have value 0. - self._memory = [0] * (capacity - 1) - self._position = 0 - self._actual_capacity = 2 * self._capacity - 1 - - def put(self, item, priority=None): - """ - Stores a transition in replay memory. - - If the memory is full, the oldest entry is replaced. - """ - if not self._isfull(): - self._memory.append(None) - position = self._next_position_then_increment() - old_priority = 0 if self._memory[position] is None \ - else (self._memory[position].priority or 0) - row = _SumRow(item, priority) - self._memory[position] = row - self._update_internal_nodes( - position, (row.priority or 0) - old_priority) - - def move(self, external_index, new_priority): - """ - Change the priority of a leaf node - """ - index = external_index + (self._capacity - 1) - return self._move(index, new_priority) - - def _move(self, index, new_priority): - """ - Change the priority of a leaf node. - """ - item, old_priority = self._memory[index] - old_priority = old_priority or 0 - self._memory[index] = _SumRow(item, new_priority) - self._update_internal_nodes(index, new_priority - old_priority) - - def _update_internal_nodes(self, index, delta): - """ - Update internal priority sums when leaf priority has been changed. - Args: - index: leaf node index - delta: change in priority - """ - # Move up tree, increasing position, updating sum - while index > 0: - index = (index - 1) // 2 - self._memory[index] += delta - - def _isfull(self): - return len(self) == self._capacity - - def _next_position_then_increment(self): - """ - Similar to position++. - """ - start = self._capacity - 1 - position = start + self._position - self._position = (self._position + 1) % self._capacity - return position - - def _sample_with_priority(self, p): - """ - Sample random element with priority greater than p. - """ - parent = 0 - while True: - left = 2 * parent + 1 - if left >= len(self._memory): - # parent points to a leaf node already. - return parent - - left_p = self._memory[left] if left < self._capacity - 1 \ - else (self._memory[left].priority or 0) - if p <= left_p: - parent = left - else: - if left + 1 >= len(self._memory): - raise RuntimeError('Right child is expected to exist.') - p -= left_p - parent = left + 1 - - def sample_minibatch(self, batch_size): - """ - Sample minibatch of size batch_size. 
- """ - pool_size = len(self) - if pool_size == 0: - return [] - - delta_p = self._memory[0] / batch_size - chosen_idx = [] - # if all priorities sum to ~0 choose randomly otherwise random sample - if abs(self._memory[0]) < util.epsilon: - chosen_idx = np.random.randint(self._capacity - 1, self._capacity - 1 + len(self), size=batch_size).tolist() - else: - for i in xrange(batch_size): - lower = max(i * delta_p, 0) - upper = min((i + 1) * delta_p, self._memory[0]) - p = random.uniform(lower, upper) - chosen_idx.append(self._sample_with_priority(p)) - return [(i, self._memory[i]) for i in chosen_idx] - - def __len__(self): - """ - Return the current number of transitions. - """ - return len(self._memory) - (self._capacity - 1) - - def __getitem__(self, index): - return self._memory[self._capacity - 1:][index] - - def __getslice__(self, start, end): - self.memory[self._capacity - 1:][start:end] - - -#TODO implement in TF -class PrioritizedReplay(Memory): - """ - Prioritised replay sampling based on loss per experience. - """ - - def __init__(self, states_spec, actions_spec, capacity, prioritization_weight=1.0, prioritization_constant=0.0): - super(PrioritizedReplay, self).__init__(states_spec=states_spec, actions_spec=actions_spec) - self.capacity = capacity - self.prioritization_weight = prioritization_weight - self.prioritization_constant = prioritization_constant - self.internals_spec = None - self.batch_indices = None - - # Stores (priority, observation) pairs - self.observations = SumTree(capacity) - - # Queue index where seen observations end and unseen ones begin. - self.none_priority_index = 0 - - # Stores last observation until next_state value is known. - self.last_observation = None - - def add_observation(self, states, internals, actions, terminal, reward): - if self.internals_spec is None and internals is not None: - self.internals_spec = [(internal.shape, internal.dtype) for internal in internals] - - if self.last_observation is not None: - observation = self.last_observation + (states, internals) - - # We are above capacity and have some seen observations - if self.observations._isfull(): - if self.none_priority_index <= 0: - raise TensorForceError( - "Trying to replace unseen observations: " - "Memory is at capacity and contains only unseen observations." - ) - self.none_priority_index -= 1 - - self.observations.put(observation, None) - - self.last_observation = (states, internals, actions, terminal, reward) - - def get_batch(self, batch_size, next_states=False): - """ - Samples a batch of the specified size according to priority. 
- - Args: - batch_size: The batch size - next_states: A boolean flag indicating whether 'next_states' values should be included - - Returns: A dict containing states, actions, rewards, terminals, internal states (and next states) - - """ - if batch_size > len(self.observations): - raise TensorForceError( - "Requested batch size is larger than observations in memory: increase config.first_update.") - - # Init empty states - states = {name: np.zeros((batch_size,) + tuple(state['shape']), dtype=util.np_dtype( - state['type'])) for name, state in self.states_spec.items()} - internals = [np.zeros((batch_size,) + shape, dtype) - for shape, dtype in self.internals_spec] - actions = {name: np.zeros((batch_size,) + tuple(action['shape']), dtype=util.np_dtype(action['type'])) for name, action in self.actions_spec.items()} - terminal = np.zeros((batch_size,), dtype=util.np_dtype('bool')) - reward = np.zeros((batch_size,), dtype=util.np_dtype('float')) - if next_states: - next_states = {name: np.zeros((batch_size,) + tuple(state['shape']), dtype=util.np_dtype( - state['type'])) for name, state in self.states_spec.items()} - next_internals = [np.zeros((batch_size,) + shape, dtype) - for shape, dtype in self.internals_spec] - - # Start with unseen observations - unseen_indices = list(xrange( - self.none_priority_index + self.observations._capacity - 1, - len(self.observations) + self.observations._capacity - 1) - ) - self.batch_indices = unseen_indices[:batch_size] - - # Get remaining observations using weighted sampling - remaining = batch_size - len(self.batch_indices) - if remaining: - samples = self.observations.sample_minibatch(remaining) - sample_indices = [i for i, o in samples] - self.batch_indices += sample_indices - - # Shuffle - np.random.shuffle(self.batch_indices) - - # Collect observations - for n, index in enumerate(self.batch_indices): - observation, _ = self.observations._memory[index] - - for name, state in states.items(): - state[n] = observation[0][name] - for k, internal in enumerate(internals): - internal[n] = observation[1][k] - for name, action in actions.items(): - action[n] = observation[2][name] - terminal[n] = observation[3] - reward[n] = observation[4] - if next_states: - for name, next_state in next_states.items(): - next_state[n] = observation[5][name] - for k, next_internal in enumerate(next_internals): - next_internal[n] = observation[6][k] - - if next_states: - return dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals - ) - else: - return dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - def update_batch(self, loss_per_instance): - """ - Computes priorities according to loss. - - Args: - loss_per_instance: - - """ - if self.batch_indices is None: - raise TensorForceError("Need to call get_batch before each update_batch call.") - # if len(loss_per_instance) != len(self.batch_indices): - # raise TensorForceError("For all instances a loss value has to be provided.") - - for index, loss in zip(self.batch_indices, loss_per_instance): - # Sampling priority is proportional to the largest absolute temporal difference error. 
- new_priority = (np.abs(loss) + self.prioritization_constant) ** self.prioritization_weight - self.observations._move(index, new_priority) - self.none_priority_index += 1 diff --git a/tensorforce/core/memories/prioritized_replay.py b/tensorforce/core/memories/prioritized_replay.py deleted file mode 100644 index 527d9108a..000000000 --- a/tensorforce/core/memories/prioritized_replay.py +++ /dev/null @@ -1,449 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf -from tensorforce import util -from tensorforce.core.memories import Memory - - -class PrioritizedReplay(Memory): - """ - Memory organized as a priority queue, which randomly retrieves experiences sampled according - their priority values. - """ - - def __init__( - self, - states, - internals, - actions, - include_next_states, - capacity, - prioritization_weight=1.0, - buffer_size=100, - scope='queue', - summary_labels=None - ): - """ - Prioritized queue memory. - - Args: - states: States specifiction. - internals: Internal states specification. - actions: Actions specification. - include_next_states: Include subsequent state if true. - capacity: Memory capacity. - prioritization_weight: Prioritization weight. - buffer_size: Buffer size. 
- """ - super(PrioritizedReplay, self).__init__( - states=states, - internals=internals, - actions=actions, - include_next_states=include_next_states, - scope=scope, - summary_labels=summary_labels - ) - self.capacity = capacity - self.buffer_size = buffer_size - self.prioritization_weight = prioritization_weight - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - assert not kwargs.get('trainable', False) - self.variables[name] = variable - return variable - - self.retrieve_indices = tf.make_template( - name_=(scope + '/retrieve_indices'), - func_=self.tf_retrieve_indices, - custom_getter_=custom_getter - ) - - def tf_initialize(self): - # States - self.states_memory = dict() - for name, state in self.states_spec.items(): - self.states_memory[name] = tf.get_variable( - name=('state-' + name), - shape=(self.capacity,) + tuple(state['shape']), - dtype=util.tf_dtype(state['type']), - trainable=False - ) - - # Internals - self.internals_memory = dict() - for name, internal in self.internals_spec.items(): - self.internals_memory[name] = tf.get_variable( - name=('internal-' + name), - shape=(self.capacity,) + tuple(internal['shape']), - dtype=util.tf_dtype(internal['type']), - trainable=False - ) - - # Actions - self.actions_memory = dict() - for name, action in self.actions_spec.items(): - self.actions_memory[name] = tf.get_variable( - name=('action-' + name), - shape=(self.capacity,) + tuple(action['shape']), - dtype=util.tf_dtype(action['type']), - trainable=False - ) - - # Terminal - self.terminal_memory = tf.get_variable( - name='terminal', - shape=(self.capacity,), - dtype=util.tf_dtype('bool'), - initializer=tf.constant_initializer( - value=tuple(n == self.capacity - 1 for n in range(self.capacity)), - dtype=util.tf_dtype('bool') - ), - trainable=False - ) - - # Reward - self.reward_memory = tf.get_variable( - name='reward', - shape=(self.capacity,), - dtype=util.tf_dtype('float'), - trainable=False - ) - - # Memory index - self.memory_index = tf.get_variable( - name='memory-index', - dtype=util.tf_dtype('int'), - initializer=0, - trainable=False - ) - - # Priorities - self.priorities = tf.get_variable( - name='priorities', - shape=(self.capacity,), - dtype=util.tf_dtype('float'), - trainable=False - ) - - # Buffer variables. The buffer is used to insert data for which we - # do not have priorities yet. 
- self.buffer_index = tf.get_variable( - name='buffer-index', - dtype=util.tf_dtype('int'), - initializer=0, - trainable=False - ) - - self.states_buffer = dict() - for name, state in self.states_spec.items(): - self.states_buffer[name] = tf.get_variable( - name=('state-buffer-' + name), - shape=(self.buffer_size,) + tuple(state['shape']), - dtype=util.tf_dtype(state['type']), - trainable=False - ) - - # Internals - self.internals_buffer = dict() - for name, internal in self.internals_spec.items(): - self.internals_buffer[name] = tf.get_variable( - name=('internal-buffer-' + name), - shape=(self.capacity,) + tuple(internal['shape']), - dtype=util.tf_dtype(internal['type']), - trainable=False - ) - - # Actions - self.actions_buffer = dict() - for name, action in self.actions_spec.items(): - self.actions_buffer[name] = tf.get_variable( - name=('action-buffer-' + name), - shape=(self.buffer_size,) + tuple(action['shape']), - dtype=util.tf_dtype(action['type']), - trainable=False - ) - - # Terminal - self.terminal_buffer = tf.get_variable( - name='terminal-buffer', - shape=(self.capacity,), - dtype=util.tf_dtype('bool'), - initializer=tf.constant_initializer( - value=tuple(n == self.buffer_size - 1 for n in range(self.capacity)), - dtype=util.tf_dtype('bool') - ), - trainable=False - ) - - # Reward - self.reward_buffer = tf.get_variable( - name='reward-buffer', - shape=(self.buffer_size,), - dtype=util.tf_dtype('float'), - trainable=False - ) - - # Indices of batch experiences in main memory. - self.batch_indices = tf.get_variable( - name='batch-indices', - dtype=util.tf_dtype('int'), - shape=(self.capacity,), - trainable=False - ) - - # Indices of batch experiences in buffer.. - self.last_batch_buffer_elems = tf.get_variable( - name='last-batch-buffer-elems', - dtype=util.tf_dtype('int'), - initializer=0, - trainable=False - ) - - def tf_store(self, states, internals, actions, terminal, reward): - # We first store new experiences into a buffer that is separate from main memory. - # We insert these into the main memory once we have computed priorities on a given batch. - num_instances = tf.shape(input=terminal)[0] - start_index = self.buffer_index - end_index = self.buffer_index + num_instances - - # Assign new observations. - assignments = list() - for name, state in states.items(): - assignments.append(tf.assign(ref=self.states_buffer[name][start_index:end_index], value=state)) - for name, internal in internals.items(): - assignments.append(tf.assign( - ref=self.internals_buffer[name][start_index:end_index], - value=internal - )) - for name, action in actions.items(): - assignments.append(tf.assign(ref=self.actions_buffer[name][start_index:end_index], value=action)) - - assignments.append(tf.assign(ref=self.terminal_buffer[start_index:end_index], value=terminal)) - assignments.append(tf.assign(ref=self.reward_buffer[start_index:end_index], value=reward)) - - # Increment memory index. - with tf.control_dependencies(control_inputs=assignments): - assignment = tf.assign(ref=self.buffer_index, value=(self.buffer_index + num_instances)) - - with tf.control_dependencies(control_inputs=(assignment,)): - return tf.no_op() - - def tf_retrieve_timesteps(self, n): - num_buffer_elems = tf.minimum(x=self.buffer_index, y=n) - num_priority_elements = n - num_buffer_elems - - def sampling_fn(): - # Vectorized sampling. 
- sum_priorities = tf.reduce_sum(input_tensor=self.priorities, axis=0) - sample = tf.random_uniform(shape=(num_priority_elements,), dtype=tf.float32) - indices = tf.zeros(shape=(num_priority_elements,), dtype=tf.int32) - - def cond(loop_index, sample): - return tf.reduce_all(input_tensor=(sample <= 0.0)) - - def sampling_body(loop_index, sample): - priority = tf.gather(params=self.priorities, indices=loop_index) - sample -= priority / sum_priorities - loop_index += tf.cast( - x=(sample > 0.0), - dtype=tf.int32, - ) - - return loop_index, sample - - priority_indices = tf.while_loop( - cond=cond, - body=sampling_body, - loop_vars=(indices, sample) - )[0] - return priority_indices - - priority_indices = tf.cond( - pred=num_priority_elements > 0, - true_fn=sampling_fn, - false_fn=lambda: tf.zeros(shape=(num_priority_elements,), dtype=tf.int32) - ) - priority_terminal = tf.gather(params=self.terminal_memory, indices=priority_indices) - priority_indices = tf.boolean_mask(tensor=priority_indices, mask=tf.logical_not(x=priority_terminal)) - - # Store how many elements we retrieved from the buffer for updating priorities. - # Note that this is just the count, as we can reconstruct the indices from that. - assignments = list() - assignments.append(tf.assign(ref=self.last_batch_buffer_elems, value=num_buffer_elems)) - - # Store indices used from priority memory. Note that these are the full indices - # as they were not taken in order. - assignments.append(tf.scatter_update( - ref=self.batch_indices, - indices=priority_indices, - updates=tf.ones(shape=tf.shape(input=priority_indices), dtype=tf.int32)) - ) - # Fetch results. - with tf.control_dependencies(control_inputs=assignments): - return self.retrieve_indices(buffer_elements=num_buffer_elems, priority_indices=priority_indices) - - def tf_retrieve_indices(self, buffer_elements, priority_indices): - """ - Fetches experiences for given indices by combining entries from buffer - which have no priorities, and entries from priority memory. - - Args: - buffer_elements: Number of buffer elements to retrieve - priority_indices: Index tensor for priority memory - - Returns: Batch of experiences - """ - states = dict() - - buffer_start = (self.buffer_index - buffer_elements) - buffer_start = tf.Print(buffer_start, [buffer_start], 'buffer start=', summarize=100) - buffer_end = (self.buffer_index) - buffer_end = tf.Print(buffer_end, [buffer_end], 'buffer_end=', summarize=100) - # Fetch entries from respective memories, concat. 
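The sampling loop in `tf_retrieve_timesteps` above subtracts `priority / sum_priorities` from a uniform sample and advances the index while the sample stays positive, which amounts to inverse-transform sampling over the priority distribution. In NumPy the same draw can be written with a cumulative sum and `searchsorted`; this is a sketch of the idea, not the graph code in the diff:

```python
import numpy as np

def sample_priority_indices(priorities, n, rng=None):
    # Draw n indices from the categorical distribution proportional to priorities.
    rng = np.random.default_rng() if rng is None else rng
    cumulative = np.cumsum(priorities / priorities.sum())
    uniform = rng.uniform(size=n)
    indices = np.searchsorted(cumulative, uniform, side='right')
    return np.minimum(indices, len(priorities) - 1)  # guard against rounding at 1.0

priorities = np.array([0.5, 0.1, 0.3, 0.1])
indices = sample_priority_indices(priorities, n=3)   # index 0 is drawn ~50% of the time
```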
- for name, state_memory in self.states_memory.items(): - buffer_state_memory = self.states_buffer[name] - buffer_states = buffer_state_memory[buffer_start:buffer_end] - memory_states = tf.gather(params=state_memory, indices=priority_indices) - # buffer_states = tf.Print(buffer_states, [buffer_states], "buffer states=", summarize=100) - # memory_states = tf.Print(memory_states, [memory_states], "memory states=", summarize=100) - states[name] = tf.concat(values=(buffer_states, memory_states), axis=0) - - internals = dict() - for name, internal_memory in self.internals_memory.items(): - internal_buffer_memory = self.internals_buffer[name] - buffer_internals = internal_buffer_memory[buffer_start:buffer_end] - memory_internals = tf.gather(params=internal_memory, indices=priority_indices) - internals[name] = tf.concat(values=(buffer_internals, memory_internals), axis=0) - - actions = dict() - for name, action_memory in self.actions_memory.items(): - action_buffer_memory = self.actions_buffer[name] - buffer_action = action_buffer_memory[buffer_start:buffer_end] - memory_action = tf.gather(params=action_memory, indices=priority_indices) - actions[name] = tf.concat(values=(buffer_action, memory_action), axis=0) - - buffer_terminal = self.terminal_buffer[buffer_start:buffer_end] - priority_terminal = tf.gather(params=self.terminal_memory, indices=priority_indices) - terminal = tf.concat(values=(buffer_terminal, priority_terminal), axis=0) - - buffer_reward = self.reward_buffer[buffer_start:buffer_end] - priority_reward = tf.gather(params=self.reward_memory, indices=priority_indices) - reward = tf.concat(values=(buffer_reward, priority_reward), axis=0) - - if self.include_next_states: - assert util.rank(priority_indices) == 1 - next_priority_indices = (priority_indices + 1) % self.capacity - next_buffer_start = (buffer_start + 1) % self.buffer_size - next_buffer_end = (buffer_end + 1) % self.buffer_size - # else: - # next_indices = (indices[:, -1] + 1) % self.capacity - - next_states = dict() - for name, state_memory in self.states_memory.items(): - buffer_state_memory = self.states_buffer[name] - buffer_next_states = buffer_state_memory[next_buffer_start:next_buffer_end] - memory_next_states = tf.gather(params=state_memory, indices=next_priority_indices) - next_states[name] = tf.concat(values=(buffer_next_states, memory_next_states), axis=0) - - next_internals = dict() - for name, internal_memory in self.internals_memory.items(): - buffer_internal_memory = self.internals_buffer[name] - buffer_next_internals = buffer_internal_memory[next_buffer_start:next_buffer_end] - memory_next_internals = tf.gather(params=internal_memory, indices=next_priority_indices) - next_internals[name] = tf.concat(values=(buffer_next_internals, memory_next_internals), axis=0) - - return dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals - ) - else: - return dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - def tf_update_batch(self, loss_per_instance): - # 1. We reconstruct the batch from the buffer and the priority memory. 
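`tf_retrieve_indices` assembles a batch by slicing the most recent buffer entries (which have no priorities yet) and gathering prioritized entries by index, then concatenating the two; subsequent states are simply the next slot modulo the capacity. The same indexing in NumPy, with placeholder arrays and without the wrap-around handling of the buffer slice:

```python
import numpy as np

def retrieve(memory, buffer, buffer_start, buffer_end, priority_indices, capacity):
    # Batch = fresh buffer entries followed by prioritized memory entries.
    batch = np.concatenate((buffer[buffer_start:buffer_end], memory[priority_indices]))
    # Next states live one slot further along the circular memory.
    next_priority_indices = (priority_indices + 1) % capacity
    next_batch = np.concatenate(
        (buffer[buffer_start + 1:buffer_end + 1], memory[next_priority_indices])
    )
    return batch, next_batch

memory = np.arange(10.0)           # prioritized storage, capacity 10
buffer = np.arange(100.0, 105.0)   # five fresh entries without priorities
batch, next_batch = retrieve(memory, buffer, 0, 3, np.array([7, 2]), capacity=10)
# batch      -> [100. 101. 102.   7.   2.]
# next_batch -> [101. 102. 103.   8.   3.]
```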
- mask = tf.not_equal( - x=self.batch_indices, - y=tf.zeros(shape=tf.shape(input=self.batch_indices), dtype=tf.int32) - ) - priority_indices = tf.squeeze(tf.where(condition=mask)) - priority_indices = tf.Print(priority_indices, [priority_indices], message="Priority indices") - sampled_batch = self.tf_retrieve_indices( - buffer_elements=self.last_batch_buffer_elems, - priority_indices=priority_indices - ) - #sampled_batch = tf.Print(sampled_batch, [sampled_batch], message="sampled batch: ") - states = sampled_batch['states'] - internals = sampled_batch['internals'] - actions = sampled_batch['actions'] - terminal = sampled_batch['terminal'] - reward = sampled_batch['reward'] - - # TODO this is incorrect - start_index = 0 - end_index = self.last_batch_buffer_elems - priorities = loss_per_instance ** self.prioritization_weight - # How do we map batch indices to memory indices and insert? - - # For testing retrieval loop, no priority inserts yet. - assignments = list() - for name, state in states.items(): - assignments.append(tf.assign(ref=self.states_memory[name][start_index:end_index], value=state)) - for name, internal in internals.items(): - assignments.append(tf.assign( - ref=self.internals_buffer[name][start_index:end_index], - value=internal - )) - assignments.append(tf.assign(ref=self.terminal_memory[start_index:end_index], value=terminal)) - assignments.append(tf.assign(ref=self.reward_memory[start_index:end_index], value=reward)) - assignments.append(tf.assign(ref=self.priorities[start_index:end_index], value=priorities)) - - for name, action in actions.items(): - assignments.append(tf.assign(ref=self.actions_memory[name][start_index:end_index], value=action)) - - # 2. We delete entries from the priority memory. There is no need - # to delete entries from the buffer because we just move the idnex. - - # Start index for inserting - # buffer_end_insert = tf.constant(value=) - - # Reset buffer index. - with tf.control_dependencies(control_inputs=assignments): - assignment = tf.assign_sub(ref=self.buffer_index, value=self.last_batch_buffer_elems) - with tf.control_dependencies(control_inputs=(assignment,)): - return tf.no_op() - - def tf_retrieve_episodes(self, n): - pass - - def tf_retrieve_sequences(self, n, sequence_length): - pass diff --git a/tensorforce/core/memories/queue.py b/tensorforce/core/memories/queue.py index c7410c067..beb0ddaea 100755 --- a/tensorforce/core/memories/queue.py +++ b/tensorforce/core/memories/queue.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,245 +13,448 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - +import numpy as np import tensorflow as tf -from tensorforce import util +from tensorforce import TensorforceError, util +from tensorforce.core import TensorDict, TensorSpec, tf_function, tf_util, VariableDict from tensorforce.core.memories import Memory class Queue(Memory): """ - Base class for memories organized as a queue (FIFO). + Base class for memories organized as a queue / circular buffer. + + Args: + capacity (int > 0): Memory capacity + (default: minimum capacity). + device (string): Device name + (default: CPU:0). + name (string): internal use. 
+ values_spec (specification): internal use. + min_capacity (int >= 0): internal use. """ - def __init__(self, states, internals, actions, include_next_states, capacity, scope='queue', summary_labels=None): - """ - Queue memory. - - Args: - states: States specifiction. - internals: Internal states specification. - actions: Actions specification. - include_next_states: Include subsequent state if true. - capacity: Memory capacity. - """ - super(Queue, self).__init__( - states=states, - internals=internals, - actions=actions, - include_next_states=include_next_states, - scope=scope, - summary_labels=summary_labels - ) - self.capacity = capacity - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - assert not kwargs.get('trainable', False) - self.variables[name] = variable - return variable - - self.retrieve_indices = tf.make_template( - name_=(scope + '/retrieve_indices'), - func_=self.tf_retrieve_indices, - custom_getter_=custom_getter + # (requires capacity as first argument) + def __init__( + self, capacity=None, *, device='CPU', name=None, values_spec=None, min_capacity=None + ): + super().__init__( + device=device, name=name, values_spec=values_spec, min_capacity=min_capacity ) - def tf_initialize(self): - # States - self.states_memory = dict() - for name, state in self.states_spec.items(): - self.states_memory[name] = tf.get_variable( - name=('state-' + name), - shape=(self.capacity,) + tuple(state['shape']), - dtype=util.tf_dtype(state['type']), - trainable=False + if capacity is None: + if self.min_capacity is None: + raise TensorforceError.required( + name='memory', argument='capacity', condition='unknown minimum capacity' + ) + else: + self.capacity = self.min_capacity + elif capacity < self.min_capacity: + raise TensorforceError.value( + name='memory', argument='capacity', value=capacity, + hint=('< minimum capacity ' + str(self.min_capacity)) ) - - # Internals - self.internals_memory = dict() - for name, internal in self.internals_spec.items(): - self.internals_memory[name] = tf.get_variable( - name=('internal-' + name), - shape=(self.capacity,) + tuple(internal['shape']), - dtype=util.tf_dtype(internal['type']), - trainable=False + else: + self.capacity = capacity + + def initialize(self): + super().initialize() + + # Value buffers + def function(name, spec): + spec = TensorSpec(type=spec.type, shape=((self.capacity,) + spec.shape)) + if name == 'terminal': + initializer = np.zeros(shape=(self.capacity,), dtype=spec.np_type()) + initializer[-1] = 1 + else: + initializer = 'zeros' + return self.variable( + name=(name + '-buffer'), spec=spec, initializer=initializer, is_trainable=False, + is_saved=True ) - # Actions - self.actions_memory = dict() - for name, action in self.actions_spec.items(): - self.actions_memory[name] = tf.get_variable( - name=('action-' + name), - shape=(self.capacity,) + tuple(action['shape']), - dtype=util.tf_dtype(action['type']), - trainable=False - ) + self.buffers = self.values_spec.fmap(function=function, cls=VariableDict, with_names=True) - # Terminal - self.terminal_memory = tf.get_variable( - name='terminal', - shape=(self.capacity,), - dtype=util.tf_dtype('bool'), - initializer=tf.constant_initializer( - value=tuple(n == self.capacity - 1 for n in range(self.capacity)), - dtype=util.tf_dtype('bool') - ), - trainable=False + # Buffer index (modulo capacity, next index to write to) + self.buffer_index = self.variable( + name='buffer-index', 
spec=TensorSpec(type='int'), initializer='zeros', + is_trainable=False, is_saved=True ) - # Reward - self.reward_memory = tf.get_variable( - name='reward', - shape=(self.capacity,), - dtype=util.tf_dtype('float'), - trainable=False + # Terminal indices + # (oldest episode terminals first, initially the only terminal is last index) + initializer = np.zeros(shape=(self.capacity + 1,), dtype=util.np_dtype(dtype='int')) + initializer[0] = self.capacity - 1 + self.terminal_indices = self.variable( + name='terminal-indices', spec=TensorSpec(type='int', shape=(self.capacity + 1,)), + initializer=initializer, is_trainable=False, is_saved=True ) - # Memory index - self.memory_index = tf.get_variable( - name='memory-index', - dtype=util.tf_dtype('int'), - initializer=0, - trainable=False + # Episode count + self.episode_count = self.variable( + name='episode-count', spec=TensorSpec(type='int'), initializer='zeros', + is_trainable=False, is_saved=True ) - # Episode indices - self.episode_indices = tf.get_variable( - name='episode-indices', - shape=(self.capacity + 1,), - dtype=util.tf_dtype('int'), - initializer=tf.constant_initializer(value=(self.capacity - 1), dtype=util.tf_dtype('int')), - trainable=False - ) + @tf_function(num_args=0) + def reset(self): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + three = tf_util.constant(value=3, dtype='int') + capacity = tf_util.constant(value=self.capacity, dtype='int') + last_index = tf.math.mod(x=(self.buffer_index - one), y=capacity) + + def correct_terminal(): + # Replace last observation terminal marker with abort terminal + dependencies = list() + two = tf_util.constant(value=2, dtype='int', shape=(1,)) + updates = tf.expand_dims(input=last_index, axis=0) # updates for below + indices = tf.expand_dims(input=updates, axis=1) + value = tf.tensor_scatter_nd_update( + tensor=self.buffers['terminal'], indices=indices, updates=two + ) + dependencies.append(self.buffers['terminal'].assign(value=value)) + # sparse_delta = tf.IndexedSlices(values=two, indices=last_index) + # dependencies.append(self.buffers['terminal'].scatter_update(sparse_delta=sparse_delta)) + indices = tf.expand_dims( + input=tf.expand_dims(input=(self.episode_count + one), axis=0), axis=1 + ) + value = tf.tensor_scatter_nd_update( + tensor=self.terminal_indices, indices=indices, updates=updates + ) + dependencies.append(self.terminal_indices.assign(value=value)) + # sparse_delta = tf.IndexedSlices(values=last_index, indices=(self.episode_count + one)) + # dependencies.append(self.terminal_indices.scatter_update(sparse_delta=sparse_delta)) + with tf.control_dependencies(control_inputs=dependencies): + return self.episode_count.assign_add(delta=one, read_value=False) + + last_terminal = tf.gather(params=self.buffers['terminal'], indices=last_index) + is_incorrect = tf.math.equal(x=last_terminal, y=three) + corrected = tf.cond(pred=is_incorrect, true_fn=correct_terminal, false_fn=tf.no_op) + + with tf.control_dependencies(control_inputs=(corrected,)): + assertions = [corrected] + if self.config.create_tf_assertions: + # general check: all terminal indices true + assertions.append(tf.debugging.assert_equal( + x=tf.reduce_all( + input_tensor=tf.gather( + params=tf.math.greater(x=self.buffers['terminal'], y=zero), + indices=self.terminal_indices[:self.episode_count + one] + ) + ), + y=tf_util.constant(value=True, dtype='bool'), + message="Memory consistency check." 
+ )) + # general check: only terminal indices true + assertions.append(tf.debugging.assert_equal( + x=tf.math.count_nonzero( + input=self.buffers['terminal'], dtype=tf_util.get_dtype(type='int') + ), + y=(self.episode_count + one), message="Memory consistency check." + )) - # Episodes index - self.episode_count = tf.get_variable( - name='episode-count', - dtype=util.tf_dtype('int'), - initializer=0, - trainable=False - ) + with tf.control_dependencies(control_inputs=assertions): + return one < zero + + @tf_function(num_args=6) + def enqueue(self, *, states, internals, auxiliaries, actions, terminal, reward): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + three = tf_util.constant(value=3, dtype='int') + capacity = tf_util.constant(value=self.capacity, dtype='int') + num_timesteps = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int') + + last_index = tf.math.mod(x=(self.buffer_index - one), y=capacity) + + def correct_terminal(): + # Remove last observation terminal marker + updates = tf_util.constant(value=0, dtype='int', shape=(1,)) + indices = tf.expand_dims(input=tf.expand_dims(input=last_index, axis=0), axis=1) + value = tf.tensor_scatter_nd_update( + tensor=self.buffers['terminal'], indices=indices, updates=updates + ) + assignment = self.buffers['terminal'].assign(value=value) + # sparse_delta = tf.IndexedSlices(values=zero, indices=last_index) + # assignment = self.buffers['terminal'].scatter_update(sparse_delta=sparse_delta) + with tf.control_dependencies(control_inputs=(assignment,)): + return last_index < zero + + last_terminal = tf.gather(params=self.buffers['terminal'], indices=last_index) + is_incorrect = tf.math.equal(x=last_terminal, y=three) + corrected = tf.cond(pred=is_incorrect, true_fn=correct_terminal, false_fn=tf.no_op) + + # Assertions + assertions = [corrected] + if self.config.create_tf_assertions: + with tf.control_dependencies(control_inputs=(corrected,)): + # check: number of timesteps fit into effectively available buffer + assertions.append(tf.debugging.assert_less_equal( + x=num_timesteps, y=capacity, message="Memory does not have enough capacity." + )) + # at most one terminal + assertions.append(tf.debugging.assert_less_equal( + x=tf.math.count_nonzero(input=terminal, dtype=tf_util.get_dtype(type='int')), + y=one, message="Timesteps contain more than one terminal." + )) + # if terminal, last timestep in batch + assertions.append(tf.debugging.assert_equal( + x=tf.math.reduce_any(input_tensor=tf.math.greater(x=terminal, y=zero)), + y=tf.math.greater(x=terminal[-1], y=zero), + message="Terminal is not the last timestep." + )) + # general check: all terminal indices true + assertions.append(tf.debugging.assert_equal( + x=tf.reduce_all( + input_tensor=tf.gather( + params=tf.math.greater(x=self.buffers['terminal'], y=zero), + indices=self.terminal_indices[:self.episode_count + one] + ) + ), + y=tf_util.constant(value=True, dtype='bool'), + message="Memory consistency check." + )) + # general check: only terminal indices true + assertions.append(tf.debugging.assert_equal( + x=tf.math.count_nonzero( + input=self.buffers['terminal'], dtype=tf_util.get_dtype(type='int') + ), + y=(self.episode_count + one), message="Memory consistency check." + )) - def tf_store(self, states, internals, actions, terminal, reward): - # Memory indices to overwrite. 
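As far as the new code above shows, the integer `terminal` buffer uses `0` for non-terminal steps, `1` for true terminals, `2` for aborted episodes, and `3` as a provisional marker on the most recently written timestep: `reset()` turns a leftover `3` into an abort terminal and registers it as an episode end, while the next `enqueue()` clears it back to `0` before writing. A plain-Python sketch of that bookkeeping, with illustrative names:

```python
NON_TERMINAL, TERMINAL, ABORT, PROVISIONAL = 0, 1, 2, 3

class TerminalMarker:
    # Track the provisional terminal marker on the newest buffer entry.

    def __init__(self, capacity):
        self.terminal = [NON_TERMINAL] * capacity
        self.episode_ends = []           # registered episode-end indices
        self.last_index = capacity - 1   # most recently written slot

    def on_reset(self):
        # Episode cut short: the provisional marker becomes an abort terminal.
        if self.terminal[self.last_index] == PROVISIONAL:
            self.terminal[self.last_index] = ABORT
            self.episode_ends.append(self.last_index)

    def on_enqueue(self, index, is_terminal):
        # Clear the previous provisional marker once new data arrives, then
        # provisionally mark the newest entry unless it is a real terminal.
        if self.terminal[self.last_index] == PROVISIONAL:
            self.terminal[self.last_index] = NON_TERMINAL
        self.terminal[index] = TERMINAL if is_terminal else PROVISIONAL
        self.last_index = index
```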
- num_instances = tf.shape(input=terminal)[0] - indices = tf.range(start=self.memory_index, limit=(self.memory_index + num_instances)) % self.capacity + # Buffer indices to overwrite + with tf.control_dependencies(control_inputs=assertions): + overwritten_indices = tf.range( + start=self.buffer_index, limit=(self.buffer_index + num_timesteps) + ) + overwritten_indices = tf.math.mod(x=overwritten_indices, y=capacity) - # Remove episode indices. - num_episodes = tf.count_nonzero( - input_tensor=tf.gather(params=self.terminal_memory, indices=indices), - axis=0, - dtype=util.tf_dtype('int') - ) - num_episodes = tf.minimum(x=num_episodes, y=self.episode_count) - assignment = tf.assign( - ref=self.episode_indices[:self.episode_count + 1 - num_episodes], - value=self.episode_indices[num_episodes: self.episode_count + 1] - ) + # Count number of overwritten episodes + num_episodes = tf.math.count_nonzero( + input=tf.gather(params=self.buffers['terminal'], indices=overwritten_indices), + axis=0, dtype=tf_util.get_dtype(type='int') + ) + + # Shift remaining terminal indices accordingly + index = self.episode_count + one + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal( + x=index, y=num_episodes, message="Memory episode overwriting check." + )) - # Decrement episode count. + with tf.control_dependencies(control_inputs=assertions): + updates = self.terminal_indices[num_episodes: index] + indices = tf.expand_dims(input=tf.range(index - num_episodes), axis=1) + value = tf.tensor_scatter_nd_update( + tensor=self.terminal_indices, indices=indices, updates=updates + ) + assignment = self.terminal_indices.assign(value=value) + # sparse_delta = tf.IndexedSlices( + # values=self.terminal_indices[num_episodes: index], + # indices=tf.range(index - num_episodes) + # ) + # assignment = self.terminal_indices.scatter_update(sparse_delta=sparse_delta) + + # Decrement episode count accordingly with tf.control_dependencies(control_inputs=(assignment,)): - assignment = tf.assign_sub(ref=self.episode_count, value=num_episodes) + assignment = self.episode_count.assign_sub(delta=num_episodes, read_value=False) - # Assign new observations. 
+ # Write new observations with tf.control_dependencies(control_inputs=(assignment,)): - assignments = list() - for name, state in states.items(): - assignments.append(tf.scatter_update( - ref=self.states_memory[name], - indices=indices, - updates=state - )) - for name, internal in internals.items(): - assignments.append(tf.scatter_update( - ref=self.internals_memory[name], - indices=indices, - updates=internal - )) - for name, action in actions.items(): - assignments.append(tf.scatter_update( - ref=self.actions_memory[name], - indices=indices, - updates=action - )) - assignments.append(tf.scatter_update(ref=self.terminal_memory, indices=indices, updates=terminal)) - assignments.append(tf.scatter_update(ref=self.reward_memory, indices=indices, updates=reward)) + # Add last observation terminal marker + corrected_terminal = tf.where( + condition=tf.math.equal(x=terminal[-1:], y=zero), x=three, y=terminal[-1:] + ) + corrected_terminal = tf.concat(values=(terminal[:-1], corrected_terminal), axis=0) + values = TensorDict( + states=states, internals=internals, auxiliaries=auxiliaries, actions=actions, + terminal=corrected_terminal, reward=reward + ) + indices = tf.range(start=self.buffer_index, limit=(self.buffer_index + num_timesteps)) + indices = tf.math.mod(x=indices, y=capacity) + indices = tf.expand_dims(input=indices, axis=1) + + def function(buffer, value): + value = tf.tensor_scatter_nd_update(tensor=buffer, indices=indices, updates=value) + return buffer.assign(value=value) + # sparse_delta = tf.IndexedSlices(values=value, indices=indices) + # return buffer.scatter_update(sparse_delta=sparse_delta) - # Increment memory index. + assignments = self.buffers.fmap(function=function, cls=list, zip_values=values) + + # Increment buffer index with tf.control_dependencies(control_inputs=assignments): - assignment = tf.assign(ref=self.memory_index, value=((self.memory_index + num_instances) % self.capacity)) + assignment = self.buffer_index.assign_add(delta=num_timesteps, read_value=False) - # Add episode indices. + # Count number of new episodes with tf.control_dependencies(control_inputs=(assignment,)): - num_episodes = tf.count_nonzero(input_tensor=terminal, axis=0, dtype=util.tf_dtype('int')) - assignment = tf.assign( - ref=self.episode_indices[self.episode_count + 1: self.episode_count + 1 + num_episodes], - value=tf.boolean_mask(tensor=indices, mask=terminal) + num_new_episodes = tf.math.count_nonzero( + input=terminal, dtype=tf_util.get_dtype(type='int') ) - # Increment episode count. 
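Taken together, `enqueue` implements a circular buffer: incoming timesteps overwrite the oldest slots, episode terminals that get overwritten are dropped from `terminal_indices` (decrementing `episode_count`), and terminals in the incoming batch are appended as new episode ends. A condensed NumPy sketch of that index bookkeeping, ignoring the provisional-marker handling and the TensorFlow variable mechanics:

```python
import numpy as np

class RingQueue:
    # Circular buffer that tracks episode-end indices like the Queue memory.

    def __init__(self, capacity):
        self.capacity = capacity
        self.reward = np.zeros(capacity)
        self.terminal = np.zeros(capacity, dtype=int)
        self.buffer_index = 0        # next slot to write, taken modulo capacity
        self.terminal_indices = []   # oldest episode ends first

    def enqueue(self, reward, terminal):
        n = len(reward)
        assert n <= self.capacity, "batch does not fit into memory"
        indices = (self.buffer_index + np.arange(n)) % self.capacity
        # Episode ends about to be overwritten are forgotten.
        overwritten = set(indices.tolist())
        self.terminal_indices = [t for t in self.terminal_indices if t not in overwritten]
        # Write the new timesteps and advance the write index.
        self.reward[indices] = reward
        self.terminal[indices] = terminal
        self.buffer_index += n
        # Register new episode ends in insertion order.
        self.terminal_indices += [int(i) for i, t in zip(indices, terminal) if t > 0]

    @property
    def episode_count(self):
        return len(self.terminal_indices)

q = RingQueue(capacity=5)
q.enqueue(reward=np.array([1.0, 0.0, 1.0]), terminal=np.array([0, 0, 1]))
q.enqueue(reward=np.array([0.5, 0.5, 0.5]), terminal=np.array([0, 1, 0]))
assert q.episode_count == 2   # episode ends at indices 2 and 4
```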
- with tf.control_dependencies(control_inputs=(assignment,)): - assignment = tf.assign_add(ref=self.episode_count, value=num_episodes) - + # Write new terminal indices + new_terminal_indices = tf.boolean_mask( + tensor=overwritten_indices, mask=tf.math.greater(x=terminal, y=zero) + ) + start = self.episode_count + one + indices = tf.expand_dims( + input=tf.range(start=start, limit=(start + num_new_episodes)), axis=1 + ) + value = tf.tensor_scatter_nd_update( + tensor=self.terminal_indices, indices=indices, updates=new_terminal_indices + ) + assignment = self.terminal_indices.assign(value=value) + # sparse_delta = tf.IndexedSlices( + # values=new_terminal_indices, + # indices=tf.range(start=start, limit=(start + num_new_episodes)) + # ) + # assignment = self.terminal_indices.scatter_update(sparse_delta=sparse_delta) + + # Increment episode count accordingly with tf.control_dependencies(control_inputs=(assignment,)): - return tf.no_op() - - def tf_retrieve_indices(self, indices): - """ - Fetches experiences for given indices. - - Args: - indices: Index tensor - - Returns: Batch of experiences - """ - states = dict() - for name, state_memory in self.states_memory.items(): - states[name] = tf.gather(params=state_memory, indices=indices) - - internals = dict() - for name, internal_memory in self.internals_memory.items(): - internals[name] = tf.gather(params=internal_memory, indices=indices) - - actions = dict() - for name, action_memory in self.actions_memory.items(): - actions[name] = tf.gather(params=action_memory, indices=indices) - - terminal = tf.gather(params=self.terminal_memory, indices=indices) - reward = tf.gather(params=self.reward_memory, indices=indices) - - if self.include_next_states: - assert util.rank(indices) == 1 - next_indices = (indices + 1) % self.capacity - - next_states = dict() - for name, state_memory in self.states_memory.items(): - next_states[name] = tf.gather(params=state_memory, indices=next_indices) - - next_internals = dict() - for name, internal_memory in self.internals_memory.items(): - next_internals[name] = tf.gather(params=internal_memory, indices=next_indices) - - return dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals + assignment = self.episode_count.assign_add(delta=num_new_episodes) + return assignment < zero + + @tf_function(num_args=1) + def retrieve(self, *, indices, values): + assert isinstance(values, tuple) + function = (lambda x: tf.gather(params=x, indices=indices)) + return self.buffers[values].fmap(function=function, cls=TensorDict) + + @tf_function(num_args=2) + def predecessors(self, *, indices, horizon, sequence_values, initial_values): + assert isinstance(sequence_values, tuple) + assert isinstance(initial_values, tuple) + + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + capacity = tf_util.constant(value=self.capacity, dtype='int') + + def body(lengths, predecessor_indices, mask): + previous_index = tf.math.mod(x=(predecessor_indices[:, :1] - one), y=capacity) + predecessor_indices = tf.concat(values=(previous_index, predecessor_indices), axis=1) + previous_terminal = tf.gather(params=self.buffers['terminal'], indices=previous_index) + is_not_terminal = tf.math.logical_and( + x=tf.math.logical_not(x=tf.math.greater(x=previous_terminal, y=zero)), + y=mask[:, :1] ) + mask = tf.concat(values=(is_not_terminal, mask), axis=1) + is_not_terminal = tf.squeeze(input=is_not_terminal, 
axis=1) + zeros = tf.zeros_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int')) + ones = tf.ones_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int')) + lengths += tf.where(condition=is_not_terminal, x=ones, y=zeros) + return lengths, predecessor_indices, mask + + lengths = tf.ones_like(input=indices, dtype=tf_util.get_dtype(type='int')) + predecessor_indices = tf.math.mod(x=tf.expand_dims(input=indices, axis=1), y=capacity) + mask = tf.ones_like(input=predecessor_indices, dtype=tf_util.get_dtype(type='bool')) + shape = tf.TensorShape(dims=((None, None))) + + lengths, predecessor_indices, mask = tf.while_loop( + cond=tf_util.always_true, body=body, loop_vars=(lengths, predecessor_indices, mask), + shape_invariants=(lengths.get_shape(), shape, shape), + maximum_iterations=tf_util.int32(x=horizon) + ) + + predecessor_indices = tf.reshape(tensor=predecessor_indices, shape=(-1,)) + mask = tf.reshape(tensor=mask, shape=(-1,)) + predecessor_indices = tf.boolean_mask(tensor=predecessor_indices, mask=mask, axis=0) + + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal( + x=tf.math.mod(x=(predecessor_indices - self.buffer_index), y=capacity), y=zero, + message="Predecessor check." + )) + + with tf.control_dependencies(control_inputs=assertions): + function = (lambda buffer: tf.gather(params=buffer, indices=predecessor_indices)) + sequence_values = self.buffers[sequence_values].fmap(function=function, cls=TensorDict) + + starts = tf.math.cumsum(x=lengths, exclusive=True) + initial_indices = tf.gather(params=predecessor_indices, indices=starts) + function = (lambda buffer: tf.gather(params=buffer, indices=initial_indices)) + initial_values = self.buffers[initial_values].fmap(function=function, cls=TensorDict) + + if len(sequence_values) == 0: + if len(initial_values) == 0: + return lengths + else: + return lengths, initial_values + + elif len(initial_values) == 0: + return tf.stack(values=(starts, lengths), axis=1), sequence_values + else: - return dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward + return tf.stack(values=(starts, lengths), axis=1), sequence_values, initial_values + + @tf_function(num_args=2) + def successors(self, *, indices, horizon, sequence_values, final_values): + assert isinstance(sequence_values, tuple) + assert isinstance(final_values, tuple) + + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + capacity = tf_util.constant(value=self.capacity, dtype='int') + + def body(lengths, successor_indices, mask): + current_index = successor_indices[:, -1:] + current_terminal = tf.gather(params=self.buffers['terminal'], indices=current_index) + is_not_terminal = tf.math.logical_and( + x=tf.math.logical_not(x=tf.math.greater(x=current_terminal, y=zero)), + y=mask[:, -1:] ) + next_index = tf.math.mod(x=(current_index + one), y=capacity) + successor_indices = tf.concat(values=(successor_indices, next_index), axis=1) + mask = tf.concat(values=(mask, is_not_terminal), axis=1) + is_not_terminal = tf.squeeze(input=is_not_terminal, axis=1) + zeros = tf.zeros_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int')) + ones = tf.ones_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int')) + lengths += tf.where(condition=is_not_terminal, x=ones, y=zeros) + return lengths, successor_indices, mask + + lengths = tf.ones_like(input=indices, dtype=tf_util.get_dtype(type='int')) + successor_indices = 
tf.math.mod(x=tf.expand_dims(input=indices, axis=1), y=capacity) + mask = tf.ones_like(input=successor_indices, dtype=tf_util.get_dtype(type='bool')) + shape = tf.TensorShape(dims=((None, None))) + + lengths, successor_indices, mask = tf.while_loop( + cond=tf_util.always_true, body=body, loop_vars=(lengths, successor_indices, mask), + shape_invariants=(lengths.get_shape(), shape, shape), + maximum_iterations=tf_util.int32(x=horizon) + ) + + successor_indices = tf.reshape(tensor=successor_indices, shape=(-1,)) + mask = tf.reshape(tensor=mask, shape=(-1,)) + successor_indices = tf.boolean_mask(tensor=successor_indices, mask=mask, axis=0) + + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal( + x=tf.math.mod(x=(self.buffer_index - one - successor_indices), y=capacity), y=zero, + message="Successor check." + )) + + with tf.control_dependencies(control_inputs=assertions): + function = (lambda buffer: tf.gather(params=buffer, indices=successor_indices)) + sequence_values = self.buffers[sequence_values].fmap(function=function, cls=TensorDict) + + starts = tf.math.cumsum(x=lengths, exclusive=True) + ends = tf.math.cumsum(x=lengths) - one + final_indices = tf.gather(params=successor_indices, indices=ends) + function = (lambda buffer: tf.gather(params=buffer, indices=final_indices)) + final_values = self.buffers[final_values].fmap(function=function, cls=TensorDict) + + if len(sequence_values) == 0: + if len(final_values) == 0: + return lengths + else: + return lengths, final_values + + elif len(final_values) == 0: + return tf.stack(values=(starts, lengths), axis=1), sequence_values + + else: + return tf.stack(values=(starts, lengths), axis=1), sequence_values, final_values diff --git a/tensorforce/core/memories/recent.py b/tensorforce/core/memories/recent.py new file mode 100644 index 000000000..32a5a60c4 --- /dev/null +++ b/tensorforce/core/memories/recent.py @@ -0,0 +1,85 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import tf_function, tf_util +from tensorforce.core.memories import Queue + + +class Recent(Queue): + """ + Batching memory which always retrieves most recent experiences (specification key: `recent`). + + Args: + capacity (int > 0): Memory capacity + (default: minimum capacity). + device (string): Device name + (default: CPU:0). + name (string): internal use. + values_spec (specification): internal use. + min_capacity (int >= 0): internal use. 
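The `predecessors` and `successors` methods above expand each sampled index into a short run of neighbouring timesteps, walking at most `horizon` steps and masking out everything beyond an episode boundary. A NumPy sketch of the successor case (predecessors walk backwards analogously); this is plain Python rather than the `tf.while_loop` formulation used in the diff:

```python
import numpy as np

def successors(terminal, capacity, indices, horizon):
    # For each start index, collect up to `horizon` following timesteps,
    # stopping early once an episode boundary (terminal > 0) is reached.
    sequences, lengths = [], []
    for start in indices:
        sequence = [start % capacity]
        for _ in range(horizon):
            current = sequence[-1]
            if terminal[current] > 0:
                break
            sequence.append((current + 1) % capacity)
        sequences.append(sequence)
        lengths.append(len(sequence))
    # The TF version returns flattened indices plus per-sequence (start, length) pairs.
    flat_indices = np.concatenate(sequences)
    starts = np.cumsum([0] + lengths[:-1])
    return np.stack([starts, np.array(lengths)], axis=1), flat_indices

terminal = np.array([0, 0, 1, 0, 0])
offsets, flat = successors(terminal, capacity=5, indices=[1, 3], horizon=2)
# The sequence from index 1 stops at the terminal (length 2); from 3 it runs the full horizon.
```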
+ """ + + @tf_function(num_args=3) + def retrieve_timesteps(self, *, n, past_horizon, future_horizon): + one = tf_util.constant(value=1, dtype='int') + capacity = tf_util.constant(value=self.capacity, dtype='int') + + # Check whether memory contains at least one valid timestep + num_timesteps = tf.math.minimum(x=self.buffer_index, y=capacity) + num_timesteps -= (past_horizon + future_horizon) + num_timesteps = tf.math.maximum(x=num_timesteps, y=self.episode_count) + + # Check whether memory contains at least one timestep + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal(x=num_timesteps, y=one)) + + # Most recent timestep indices range + with tf.control_dependencies(control_inputs=assertions): + n = tf.math.minimum(x=n, y=num_timesteps) + indices = tf.range(start=(self.buffer_index - n), limit=self.buffer_index) + indices = tf.math.mod(x=(indices - future_horizon), y=capacity) + + return indices + + @tf_function(num_args=1) + def retrieve_episodes(self, *, n): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + capacity = tf_util.constant(value=self.capacity, dtype='int') + + # Check whether memory contains at least one episode + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal(x=self.episode_count, y=one)) + + # Get start and limit index for most recent n episodes + with tf.control_dependencies(control_inputs=assertions): + n = tf.math.minimum(x=n, y=self.episode_count) + + # (Increment terminal of previous episode) + start = self.terminal_indices[self.episode_count - n] + one + limit = self.terminal_indices[self.episode_count] + one + + # Correct limit index if smaller than start index + limit = limit + tf.where(condition=(limit < start), x=capacity, y=zero) + + # Most recent episode indices range + indices = tf.range(start=start, limit=limit) + indices = tf.math.mod(x=indices, y=capacity) + + return indices diff --git a/tensorforce/core/memories/replay.py b/tensorforce/core/memories/replay.py index 3f5110c34..a830db54f 100755 --- a/tensorforce/core/memories/replay.py +++ b/tensorforce/core/memories/replay.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,68 +13,80 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import tensorflow as tf +from tensorforce.core import tf_function, tf_util from tensorforce.core.memories import Queue class Replay(Queue): """ - Memory which randomly retrieves experiences. + Replay memory which randomly retrieves experiences (specification key: `replay`). + + Args: + capacity (int > 0): Memory capacity + (default: minimum capacity). + device (string): Device name + (default: CPU:0). + name (string): internal use. + values_spec (specification): internal use. + min_capacity (int >= 0): internal use. """ - def __init__(self, states, internals, actions, include_next_states, capacity, scope='replay', summary_labels=None): - """ - Replay memory. - - Args: - states: States specification. - internals: Internal states specification. - actions: Actions specification. 
- include_next_states: Include subsequent state if true. - capacity: Memory capacity. - """ - super(Replay, self).__init__( - states=states, - internals=internals, - actions=actions, - include_next_states=include_next_states, - capacity=capacity, - scope=scope, - summary_labels=summary_labels - ) - - def tf_retrieve_timesteps(self, n): - num_timesteps = (self.memory_index - self.episode_indices[0] - 2) % self.capacity + 1 - indices = tf.random_uniform(shape=(n,), maxval=num_timesteps, dtype=tf.int32) - indices = (self.memory_index - 1 - indices) % self.capacity - terminal = tf.gather(params=self.terminal_memory, indices=indices) - indices = tf.boolean_mask(tensor=indices, mask=tf.logical_not(x=terminal)) - return self.retrieve_indices(indices=indices) - - def tf_retrieve_episodes(self, n): - random_episode_indices = tf.random_uniform(shape=(n,), maxval=(self.episode_count + 1), dtype=tf.int32) - starts = tf.gather(params=self.episode_indices, indices=random_episode_indices) + 1 - limits = tf.gather(params=self.episode_indices, indices=(random_episode_indices + 1)) - limits += tf.where( - condition=(starts < limits), - x=tf.constant(value=0, shape=(n,)), - y=tf.constant(value=self.capacity, shape=(n,)) - ) - episodes = [tf.range(start=starts[k], limit=limits[k]) for k in range(n)] - indices = tf.concat(values=episodes, axis=0) % self.capacity - return self.retrieve_indices(indices=indices) - - def tf_retrieve_sequences(self, n, sequence_length): - num_sequences = (self.memory_index - self.episode_indices[0] - 2 - sequence_length + 1) % self.capacity + 1 - indices = tf.random_uniform(shape=(n,), maxval=num_sequences, dtype=tf.int32) - indices = (self.memory_index - 1 - indices - sequence_length) % self.capacity - sequence_indices = [tf.range(start=indices[k], limit=(indices[k] + sequence_length)) for k in range(n)] - sequence_indices = tf.concat(values=sequence_indices, axis=0) % self.capacity # tf.stack !!!!! 
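Both the deleted implementation above and the rewritten `Replay.retrieve_timesteps` below sample timesteps the same basic way: draw uniform offsets into the window of recently written valid entries and map them back to buffer slots modulo the capacity, counting backwards from the current write index (the deleted version additionally filters out indices that land on terminal states). A NumPy sketch of that mapping:

```python
import numpy as np

def sample_recent_window(write_index, capacity, num_valid, n, rng=None):
    # Draw n uniform offsets into the valid window and convert them to
    # circular-buffer slots, counting backwards from the write index.
    rng = np.random.default_rng() if rng is None else rng
    offsets = rng.integers(num_valid, size=n)       # uniform in [0, num_valid)
    return (write_index - 1 - offsets) % capacity   # write_index - 1 is the newest slot
```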
- terminal = tf.gather(params=self.terminal_memory, indices=indices) - sequence_indices = tf.boolean_mask(tensor=sequence_indices, mask=tf.logical_not(x=terminal)) - return self.retrieve_indices(indices=sequence_indices) + @tf_function(num_args=3) + def retrieve_timesteps(self, *, n, past_horizon, future_horizon): + one = tf_util.constant(value=1, dtype='int') + capacity = tf_util.constant(value=self.capacity, dtype='int') + + # Check whether memory contains at least one valid timestep + num_timesteps = tf.math.minimum(x=self.buffer_index, y=capacity) + num_timesteps -= (past_horizon + future_horizon) + num_timesteps = tf.math.maximum(x=num_timesteps, y=self.episode_count) + + # Check whether memory contains at least one timestep + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal(x=num_timesteps, y=one)) + + # Randomly sampled timestep indices + with tf.control_dependencies(control_inputs=assertions): + n = tf.math.minimum(x=n, y=num_timesteps) + indices = tf.random.uniform( + shape=(n,), maxval=num_timesteps, dtype=tf_util.get_dtype(type='int') + ) + indices = tf.math.mod( + x=(self.buffer_index - one - indices - future_horizon), y=capacity + ) + + return indices + + @tf_function(num_args=1) + def retrieve_episodes(self, *, n): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + capacity = tf_util.constant(value=self.capacity, dtype='int') + + # Check whether memory contains at least one episode + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal(x=self.episode_count, y=one)) + + # Get start and limit indices for randomly sampled n episodes + with tf.control_dependencies(control_inputs=assertions): + n = tf.math.minimum(x=n, y=self.episode_count) + random_indices = tf.random.uniform( + shape=(n,), maxval=self.episode_count, dtype=tf_util.get_dtype(type='int') + ) + + # (Increment terminal of previous episode) + starts = tf.gather(params=self.terminal_indices, indices=random_indices) + one + limits = tf.gather(params=self.terminal_indices, indices=(random_indices + one)) + one + + # Correct limit index if smaller than start index + limits = limits + tf.where(condition=(limits < starts), x=capacity, y=zero) + + # Random episode indices ranges + indices = tf.ragged.range(starts=starts, limits=limits).values + indices = tf.math.mod(x=indices, y=capacity) + + return indices diff --git a/tensorforce/core/explorations/constant.py b/tensorforce/core/models/__init__.py old mode 100755 new mode 100644 similarity index 57% rename from tensorforce/core/explorations/constant.py rename to tensorforce/core/models/__init__.py index 3f161f2ae..063c3fdfc --- a/tensorforce/core/explorations/constant.py +++ b/tensorforce/core/models/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,12 @@ # limitations under the License. 
# ============================================================================== -from tensorforce.core.explorations import Exploration +from tensorforce.core.models.model import Model +# Require Model +from tensorforce.core.models.constant import ConstantModel +from tensorforce.core.models.tensorforce import TensorforceModel +from tensorforce.core.models.random import RandomModel -class Constant(Exploration): - """ - Explore via adding a constant term. - """ - def __init__(self, constant=0.0, scope='constant', summary_labels=()): - self.constant = constant - super(Constant, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_explore(self, episode, timestep, action_spec=None): - return self.constant +__all__ = ['ConstantModel', 'Model', 'RandomModel', 'TensorforceModel'] diff --git a/tensorforce/core/models/constant.py b/tensorforce/core/models/constant.py new file mode 100644 index 000000000..1bb575a7c --- /dev/null +++ b/tensorforce/core/models/constant.py @@ -0,0 +1,112 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import TensorDict, tf_function, tf_util +from tensorforce.core.models import Model + + +class ConstantModel(Model): + """ + Utility class to return constant actions of a desired shape and with given bounds. 
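As the docstring says, `ConstantModel` (backing `ConstantAgent`) emits fixed actions: a user-supplied value if one was given, the first unmasked value for maskable int actions, the midpoint when both bounds are known, the single available bound otherwise, and zero as a last resort. A simplified NumPy sketch of that decision rule, not the TF graph code that follows:

```python
import numpy as np

def constant_action(shape, value=None, min_value=None, max_value=None, mask=None):
    # Default action choice, mirroring ConstantModel.core_act in simplified form.
    if value is not None:                      # user-specified constant action
        return np.full(shape, value)
    if mask is not None:                       # int action with masking: first allowed value
        return np.argmax(mask, axis=-1)
    if min_value is not None and max_value is not None:
        return np.full(shape, min_value + 0.5 * (max_value - min_value))  # midpoint
    if min_value is not None:
        return np.full(shape, min_value)
    if max_value is not None:
        return np.full(shape, max_value)
    return np.zeros(shape)                     # otherwise: zero action

constant_action(shape=(2,), min_value=-1.0, max_value=3.0)          # -> [1.0, 1.0]
constant_action(shape=(2,), mask=np.array([[False, True, True],
                                           [False, False, True]]))  # -> [1, 2]
```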
+ """ + + def __init__( + self, *, states, actions, parallel_interactions, config, summarizer, tracking, action_values + ): + super().__init__( + states=states, actions=actions, l2_regularization=0.0, + parallel_interactions=parallel_interactions, config=config, saver=None, + summarizer=summarizer, tracking=tracking + ) + + self.action_values = dict() + if action_values is not None: + for name, spec in self.actions_spec.items(): + if name not in action_values: + continue + value = spec.py_type()(action_values[name]) + if spec.type != 'bool' and spec.min_value is not None and value < spec.min_value: + raise TensorforceError.value( + name='ConstantAgent', argument='action_values[{}]'.format(name), + value=value, hint='> max_value' + ) + if spec.type != 'bool' and spec.max_value is not None and value > spec.max_value: + raise TensorforceError.value( + name='ConstantAgent', argument='action_values[{}]'.format(name), + value=value, hint='> max_value' + ) + self.action_values[name] = value + + @tf_function(num_args=5) + def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent): + assert len(internals) == 0 + + actions = TensorDict() + x = tf.shape(input=states.value())[:1] + for name, spec in self.actions_spec.items(): + shape = tf.concat(values=( + tf_util.cast(x=x, dtype='int'), + tf_util.constant(value=spec.shape, dtype='int') + ), axis=0) + + if self.action_values is not None and name in self.action_values: + # If user-specified, choose given action + action = tf_util.constant(value=self.action_values[name], dtype=spec.type) + actions[name] = tf.fill(dims=shape, value=action) + + elif self.config.enable_int_action_masking and spec.type == 'int' and \ + spec.num_values is not None: + # If masking, choose first unmasked action + mask = auxiliaries[name]['mask'] + choices = tf_util.constant( + value=list(range(spec.num_values)), dtype='int', + shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values)) + ) + one = tf_util.constant(value=1, dtype='int', shape=(1,)) + multiples = tf.concat(values=(shape, one), axis=0) + choices = tf.tile(input=choices, multiples=multiples) + choices = tf.boolean_mask(tensor=choices, mask=mask) + mask = tf_util.cast(x=mask, dtype='int') + num_valid = tf.math.reduce_sum(input_tensor=mask, axis=(spec.rank + 1)) + num_valid = tf.reshape(tensor=num_valid, shape=(-1,)) + masked_offset = tf.math.cumsum(x=num_valid, axis=0, exclusive=True) + action = tf.gather(params=choices, indices=masked_offset) + actions[name] = tf.reshape(tensor=action, shape=shape) + + elif spec.type != 'bool' and spec.min_value is not None: + if spec.max_value is not None: + # If min/max_value given, choose mean action + action = spec.min_value + 0.5 * (spec.max_value - spec.min_value) + action = tf_util.constant(value=action, dtype=spec.type) + actions[name] = tf.fill(dims=shape, value=action) + + else: + # If only min_value given, choose min_value + action = tf_util.constant(value=spec.min_value, dtype=spec.type) + actions[name] = tf.fill(dims=shape, value=action) + + elif spec.type != 'bool' and spec.max_value is not None: + # If only max_value given, choose max_value + action = tf_util.constant(value=spec.max_value, dtype=spec.type) + actions[name] = tf.fill(dims=shape, value=action) + + else: + # Else choose zero + actions[name] = tf_util.zeros(shape=shape, dtype=spec.type) + + return actions, TensorDict() diff --git a/tensorforce/core/models/model.py b/tensorforce/core/models/model.py new file mode 100644 index 000000000..32032bf54 --- /dev/null +++ 
b/tensorforce/core/models/model.py @@ -0,0 +1,979 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict +import logging +import os +import time + +import h5py +import numpy as np +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import ArrayDict, Module, SignatureDict, TensorDict, TensorSpec, \ + TensorsSpec, tf_function, tf_util, VariableDict +from tensorforce.core.layers import Layer + + +class Model(Module): + + def __init__( + self, *, states, actions, l2_regularization, parallel_interactions, config, saver, + summarizer, tracking + ): + # Initialize global registries + setattr(Module, '_MODULE_STACK', list()) + setattr(Layer, '_REGISTERED_LAYERS', OrderedDict()) + + # Tensorforce config + self._config = config + + Module._MODULE_STACK.clear() + Module._MODULE_STACK.append(self.__class__) + + super().__init__( + device=self._config.device, l2_regularization=l2_regularization, name=self._config.name + ) + + assert self.l2_regularization is not None + self.is_trainable = True + self.is_saved = True + + # Keep track of tensor names to check for collisions + self.value_names = set() + + # Terminal specification + self.terminal_spec = TensorSpec(type='int', shape=(), num_values=3) + self.value_names.add('terminal') + + # Reward specification + self.reward_spec = TensorSpec(type='float', shape=()) + self.value_names.add('reward') + + # Parallel specification + self.parallel_spec = TensorSpec(type='int', shape=(), num_values=parallel_interactions) + self.value_names.add('parallel') + + # Deterministic specification + self.deterministic_spec = TensorSpec(type='bool', shape=()) + self.value_names.add('deterministic') + + # State space specification + self.states_spec = states + for name, spec in self.states_spec.items(): + name = ('' if name is None else ' ' + name) + if spec.type != 'float': + continue + elif spec.min_value is None: + logging.warning("No min_value bound specified for state{}.".format(name)) + elif np.isinf(spec.min_value).any(): + logging.warning("Infinite min_value bound for state{}.".format(name)) + elif spec.max_value is None: + logging.warning("No max_value bound specified for state{}.".format(name)) + elif np.isinf(spec.max_value).any(): + logging.warning("Infinite max_value bound for state{}.".format(name)) + + # Check for name collisions + if self.states_spec.is_singleton(): + if 'state' in self.value_names: + raise TensorforceError.exists(name='value name', value=name) + self.value_names.add('state') + else: + for name in self.states_spec: + if name in self.value_names: + raise TensorforceError.exists(name='value name', value=name) + self.value_names.add(name) + + # Action space specification + self.actions_spec = actions + for name, spec in self.actions_spec.items(): + name = ('' if name is None else ' ' + name) + if spec.type != 'float': + continue + elif 
spec.min_value is None: + logging.warning("No min_value specified for action{}.".format(name)) + elif np.isinf(spec.min_value).any(): + raise TensorforceError("Infinite min_value bound for action{}.".format(name)) + elif spec.max_value is None: + logging.warning("No max_value specified for action{}.".format(name)) + elif np.isinf(spec.max_value).any(): + raise TensorforceError("Infinite max_value bound for action{}.".format(name)) + + # Check for name collisions + if self.actions_spec.is_singleton(): + if 'action' in self.value_names: + raise TensorforceError.exists(name='value name', value=name) + self.value_names.add('action') + else: + for name in self.actions_spec: + if name in self.value_names: + raise TensorforceError.exists(name='value name', value=name) + self.value_names.add(name) + + # Internal state space specification + self.internals_spec = TensorsSpec() + self.initial_internals = ArrayDict() + + # Auxiliary value space specification + self.auxiliaries_spec = TensorsSpec() + for name, spec in self.actions_spec.items(): + if self.config.enable_int_action_masking and spec.type == 'int' and \ + spec.num_values is not None: + self.auxiliaries_spec[name] = TensorsSpec(mask=TensorSpec( + type='bool', shape=(spec.shape + (spec.num_values,)) + )) + + # Parallel interactions + assert isinstance(parallel_interactions, int) and parallel_interactions >= 1 + self.parallel_interactions = parallel_interactions + + # Saver + if isinstance(saver, str): + saver = dict(directory=saver) + if saver is None: + self.saver = None + elif not all(key in ( + 'directory', 'filename', 'frequency', 'load', 'max_checkpoints', 'max_hour_frequency', + 'unit' + ) for key in saver): + raise TensorforceError.value( + name='agent', argument='saver', value=list(saver), + hint='not from {directory,filename,frequency,load,max_checkpoints,' + 'max_hour_frequency,unit}' + ) + elif 'directory' not in saver: + raise TensorforceError.required(name='agent', argument='saver[directory]') + else: + self.saver = dict(saver) + + # Summarizer + if isinstance(summarizer, str): + summarizer = dict(directory=summarizer) + if summarizer is None: + self.summarizer = None + self.summaries = frozenset() + elif not all( + key in ('directory', 'filename', 'flush', 'max_summaries', 'summaries') + for key in summarizer + ): + raise TensorforceError.value( + name='agent', argument='summarizer', value=list(summarizer), + hint='not from {directory,filename,flush,max_summaries,summaries}' + ) + elif 'directory' not in summarizer: + raise TensorforceError.required(name='agent', argument='summarizer[directory]') + else: + self.summarizer = dict(summarizer) + + # Summary labels + summaries = summarizer.get('summaries') + if summaries is None or summaries == 'all': + self.summaries = 'all' + elif not all(isinstance(label, str) for label in summaries): + raise TensorforceError.value( + name='agent', argument='summarizer[summaries]', value=summaries + ) + else: + self.summaries = frozenset(summaries) + + # Tracking + if tracking is None: + self.tracking = frozenset() + elif tracking == 'all': + self.tracking = 'all' + else: + self.tracking = frozenset(tracking) + + def get_architecture(self): + raise NotImplementedError + + @property + def root(self): + return self + + @property + def config(self): + return self._config + + @property + def full_name(self): + return self.name + + def close(self): + if self.saver is not None: + self.save() + if self.summarizer is not None: + self.summarizer.close() + delattr(Module, '_MODULE_STACK') + delattr(Layer, 
'_REGISTERED_LAYERS') + + def __enter__(self): + assert self.is_initialized is not None + if self.is_initialized: + Module._MODULE_STACK.append(self) + else: + # Hack: keep non-empty module stack from constructor + assert len(Module._MODULE_STACK) == 1 and Module._MODULE_STACK[0] is self + self.device.__enter__() + self.name_scope.__enter__() + return self + + def __exit__(self, etype, exception, traceback): + self.name_scope.__exit__(etype, exception, traceback) + self.device.__exit__(etype, exception, traceback) + popped = Module._MODULE_STACK.pop() + assert popped is self + assert self.is_initialized is not None + if not self.is_initialized: + assert len(Module._MODULE_STACK) == 0 + + def initialize(self): + assert self.is_initialized is None + self.is_initialized = False + + with self: + + if self.summarizer is not None: + directory = self.summarizer['directory'] + if os.path.isdir(directory): + directories = sorted( + d for d in os.listdir(directory) + if os.path.isdir(os.path.join(directory, d)) and d.startswith('summary-') + ) + else: + os.makedirs(directory) + directories = list() + + filename = self.summarizer.get('filename') + if filename is None: + filename = time.strftime('summary-%Y%m%d-%H%M%S') + + max_summaries = self.summarizer.get('max_summaries') + if max_summaries is None: + max_summaries = 7 + if len(directories) > max_summaries - 1: + for subdir in directories[:len(directories) - max_summaries + 1]: + subdir = os.path.join(directory, subdir) + os.remove(os.path.join(subdir, os.listdir(subdir)[0])) + os.rmdir(subdir) + + logdir = os.path.join(directory, filename) + flush_millis = self.summarizer.get('flush') + if flush_millis is None: + flush_millis = 10000 + else: + flush_millis *= 1000 + # with tf.name_scope(name='summarizer'): + self.summarizer = tf.summary.create_file_writer( + logdir=logdir, max_queue=None, flush_millis=flush_millis, filename_suffix=None, + name='summarizer' + ) + + # TODO: write agent spec? + # tf.summary.text(name, data, step=None, description=None) + + super().initialize() + + self.core_initialize() + + # Units, used in: Parameter, Model.save(), Model.summarizer???? 
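+            # (i.e. a mapping from unit name to the corresponding counter variable, used
+            # e.g. as the step counter of the checkpoint manager below and by Parameter
+            # schedules that are specified in timesteps/episodes/updates)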
+ self.units = dict( + timesteps=self.timesteps, episodes=self.episodes, updates=self.updates + ) + + # Checkpoint manager + if self.saver is not None: + self.saver_directory = self.saver['directory'] + self.saver_filename = self.saver.get('filename') + if self.saver_filename is None: + self.saver_filename = self.name + load = self.saver.get('load', False) + max_checkpoints = self.saver.get('max_checkpoints') + if max_checkpoints is None: + max_checkpoints = 10 + unit = self.saver.get('unit') + if unit is None: + unit = 'updates' + frequency = self.saver.get('frequency') + if frequency is None: + frequency = 10 + # with tf.name_scope(name='saver'): + self.checkpoint = tf.train.Checkpoint(**{self.name: self}) + self.saver = tf.train.CheckpointManager( + checkpoint=self.checkpoint, directory=self.saver_directory, + max_to_keep=max_checkpoints, + keep_checkpoint_every_n_hours=self.saver.get('max_hour_frequency'), + checkpoint_name=self.saver_filename, step_counter=self.units[unit], + checkpoint_interval=frequency, init_fn=None + ) + + self.is_initialized = True + + if self.summarizer is None: + self.initialize_api() + else: + with self.summarizer.as_default(): + self.initialize_api() + + if self.saver is not None: + if load: + self.restore() + else: + self.save() + + def core_initialize(self): + # Timestep counter + self.timesteps = self.variable( + name='timesteps', spec=TensorSpec(type='int'), initializer='zeros', is_trainable=False, + is_saved=True + ) + + # Episode counter + self.episodes = self.variable( + name='episodes', spec=TensorSpec(type='int'), initializer='zeros', is_trainable=False, + is_saved=True + ) + + # Update counter + self.updates = self.variable( + name='updates', spec=TensorSpec(type='int'), initializer='zeros', is_trainable=False, + is_saved=True + ) + + # Episode length/return + self.episode_length = self.variable( + name='episode-length', spec=TensorSpec(type='int', shape=(self.parallel_interactions,)), + initializer='zeros', is_trainable=False, is_saved=False + ) + self.episode_return = self.variable( + name='episode-return', + spec=TensorSpec(type=self.reward_spec.type, shape=(self.parallel_interactions,)), + initializer='zeros', is_trainable=False, is_saved=False + ) + + # Internals buffers + def function(name, spec, initial): + shape = (self.parallel_interactions,) + spec.shape + reps = (self.parallel_interactions,) + tuple(1 for _ in range(spec.rank)) + initializer = np.tile(np.expand_dims(initial, axis=0), reps=reps) + return self.variable( + name=(name + '-buffer'), spec=TensorSpec(type=spec.type, shape=shape), + initializer=initializer, is_trainable=False, is_saved=False + ) + + self.previous_internals = self.internals_spec.fmap( + function=function, cls=VariableDict, with_names=True, zip_values=self.initial_internals + ) + + def initialize_api(self): + if 'graph' in self.summaries: + tf.summary.trace_on(graph=True, profiler=False) + self.reset(_initialize=True) + if 'graph' in self.summaries: + tf.summary.trace_export(name='reset', step=self.timesteps, profiler_outdir=None) + tf.summary.trace_on(graph=True, profiler=False) + self.act( + states=self.states_spec, auxiliaries=self.auxiliaries_spec, parallel=self.parallel_spec, + _initialize=True + ) + if 'graph' in self.summaries: + tf.summary.trace_export(name='act', step=self.timesteps, profiler_outdir=None) + tf.summary.trace_on(graph=True, profiler=False) + kwargs = dict(states=self.states_spec) + if len(self.internals_spec) > 0: + kwargs['internals'] = self.internals_spec + if len(self.auxiliaries_spec) > 
0: + kwargs['auxiliaries'] = self.auxiliaries_spec + kwargs['deterministic'] = self.deterministic_spec + self.independent_act(**kwargs, _initialize=True) + if 'graph' in self.summaries: + tf.summary.trace_export( + name='independent-act', step=self.timesteps, profiler_outdir=None + ) + tf.summary.trace_on(graph=True, profiler=False) + self.observe( + terminal=self.terminal_spec, reward=self.reward_spec, parallel=self.parallel_spec, + _initialize=True + ) + if 'graph' in self.summaries: + tf.summary.trace_export(name='observe', step=self.timesteps, profiler_outdir=None) + + def get_savedmodel_trackables(self): + return dict() + + def input_signature(self, *, function): + if function == 'act': + return SignatureDict( + states=self.states_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + parallel=self.parallel_spec.signature(batched=True) + ) + + elif function == 'core_act': + return SignatureDict( + states=self.states_spec.signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + parallel=self.parallel_spec.signature(batched=True), + deterministic=self.deterministic_spec.signature(batched=False) + ) + + elif function == 'core_observe': + return SignatureDict( + terminal=self.terminal_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True), + parallel=self.parallel_spec.signature(batched=False) + ) + + elif function == 'independent_act': + signature = SignatureDict(states=self.states_spec.signature(batched=True)) + if len(self.internals_spec) > 0: + signature['internals'] = self.internals_spec.signature(batched=True) + if len(self.auxiliaries_spec) > 0: + signature['auxiliaries'] = self.auxiliaries_spec.signature(batched=True) + signature['deterministic'] = self.deterministic_spec.signature(batched=False) + return signature + + elif function == 'observe': + return SignatureDict( + terminal=self.terminal_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True), + parallel=self.parallel_spec.signature(batched=False) + ) + + elif function == 'reset': + return SignatureDict() + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'act': + return SignatureDict( + actions=self.actions_spec.signature(batched=True), + timesteps=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + if function == 'core_act': + return SignatureDict( + actions=self.actions_spec.signature(batched=True), + internals=self.internals_spec.signature(batched=True) + ) + + elif function == 'core_observe': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'independent_act': + if len(self.internals_spec) > 0: + return SignatureDict( + actions=self.actions_spec.signature(batched=True), + internals=self.internals_spec.signature(batched=True) + ) + else: + return SignatureDict(singleton=self.actions_spec.signature(batched=True)) + + elif function == 'observe': + return SignatureDict( + updated=TensorSpec(type='bool', shape=()).signature(batched=False), + episodes=TensorSpec(type='int', shape=()).signature(batched=False), + updates=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + elif function == 'reset': + return SignatureDict( + timesteps=TensorSpec(type='int', shape=()).signature(batched=False), + episodes=TensorSpec(type='int', shape=()).signature(batched=False), + 
updates=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=0, api_function=True) + def reset(self): + timestep = tf_util.identity(input=self.timesteps) + episode = tf_util.identity(input=self.episodes) + update = tf_util.identity(input=self.updates) + return timestep, episode, update + + @tf_function(num_args=4, optional=2, api_function=True, dict_interface=True) + def independent_act(self, *, states, internals=None, auxiliaries=None, deterministic=None): + if internals is None: + assert len(self.internals_spec) == 0 + internals = TensorDict() + if auxiliaries is None: + assert len(self.auxiliaries_spec) == 0 + auxiliaries = TensorDict() + assert deterministic is not None + batch_size = tf_util.cast(x=tf.shape(input=states.value())[0], dtype='int') + + # Input assertions + assertions = list() + if self.config.create_tf_assertions: + true = tf_util.constant(value=True, dtype='bool') + assertions.extend(self.states_spec.tf_assert( + x=states, batch_size=batch_size, + message='Agent.independent_act: invalid {issue} for {name} state input.' + )) + assertions.extend(self.internals_spec.tf_assert( + x=internals, batch_size=batch_size, + message='Agent.independent_act: invalid {issue} for {name} internal input.' + )) + assertions.extend(self.auxiliaries_spec.tf_assert( + x=auxiliaries, batch_size=batch_size, + message='Agent.independent_act: invalid {issue} for {name} input.' + )) + assertions.extend(self.deterministic_spec.tf_assert( + x=deterministic, + message='Agent.independent_act: invalid {issue} for deterministic input.' + )) + # Mask assertions + if self.config.enable_int_action_masking: + for name, spec in self.actions_spec.items(): + if spec.type == 'int': + assertions.append(tf.debugging.assert_equal( + x=tf.reduce_all(input_tensor=tf.math.reduce_any( + input_tensor=auxiliaries[name]['mask'], axis=(spec.rank + 1) + )), y=true, + message="Agent.independent_act: at least one action has to be valid." + )) + + with tf.control_dependencies(control_inputs=assertions): + # Core act + parallel = tf_util.zeros(shape=tf.expand_dims(batch_size, axis=0), dtype='int') + actions, internals = self.core_act( + states=states, internals=internals, auxiliaries=auxiliaries, parallel=parallel, + deterministic=deterministic, independent=True + ) + # Skip action assertions + + if len(self.internals_spec) > 0: + return actions, internals + else: + return actions + + @tf_function(num_args=3, api_function=True) + def act(self, *, states, auxiliaries, parallel): + batch_size = tf_util.cast(x=tf.shape(input=parallel)[0], dtype='int') + + # Input assertions + assertions = list() + if self.config.create_tf_assertions: + assertions.extend(self.states_spec.tf_assert( + x=states, batch_size=batch_size, + message='Agent.act: invalid {issue} for {name} state input.' + )) + assertions.extend(self.auxiliaries_spec.tf_assert( + x=auxiliaries, batch_size=batch_size, + message='Agent.act: invalid {issue} for {name} input.' + )) + assertions.extend(self.parallel_spec.tf_assert( + x=parallel, batch_size=batch_size, + message='Agent.act: invalid {issue} for parallel input.' 
+            ))
+            # Mask assertions
+            if self.config.enable_int_action_masking:
+                true = tf_util.constant(value=True, dtype='bool')
+                for name, spec in self.actions_spec.items():
+                    if spec.type == 'int':
+                        assertions.append(tf.debugging.assert_equal(
+                            x=tf.reduce_all(input_tensor=tf.math.reduce_any(
+                                input_tensor=auxiliaries[name]['mask'], axis=(spec.rank + 1)
+                            )), y=true,
+                            message="Agent.act: at least one action has to be valid."
+                        ))
+
+        with tf.control_dependencies(control_inputs=assertions):
+            # Retrieve internals
+            internals = self.previous_internals.fmap(
+                function=(lambda x: tf.gather(params=x, indices=parallel)), cls=TensorDict
+            )
+
+            # Core act
+            deterministic = tf_util.constant(value=False, dtype='bool')
+            actions, internals = self.core_act(
+                states=states, internals=internals, auxiliaries=auxiliaries, parallel=parallel,
+                deterministic=deterministic, independent=False
+            )
+
+            # Action assertions
+            assertions = list()
+            if self.config.create_tf_assertions:
+                assertions.extend(self.actions_spec.tf_assert(x=actions, batch_size=batch_size))
+                if self.config.enable_int_action_masking:
+                    for name, spec, action in self.actions_spec.zip_items(actions):
+                        if spec.type == 'int':
+                            is_valid = tf.reduce_all(input_tensor=tf.gather(
+                                params=auxiliaries[name]['mask'],
+                                indices=tf.expand_dims(input=action, axis=(spec.rank + 1)),
+                                batch_dims=(spec.rank + 1)
+                            ))
+                            assertions.append(tf.debugging.assert_equal(
+                                x=is_valid, y=true, message="Action mask check."
+                            ))
+
+            # Remember internals
+            dependencies = list()
+            indices = tf.expand_dims(input=parallel, axis=1)
+            for name, previous, internal in self.previous_internals.zip_items(internals):
+                value = tf.tensor_scatter_nd_update(tensor=previous, indices=indices, updates=internal)
+                dependencies.append(previous.assign(value=value))
+                # sparse_delta = tf.IndexedSlices(values=internal, indices=parallel)
+                # dependencies.append(previous.scatter_update(sparse_delta=sparse_delta))
+
+            # Increment timestep (after core act)
+            with tf.control_dependencies(control_inputs=(actions.flatten() + internals.flatten())):
+                dependencies.append(self.timesteps.assign_add(delta=batch_size, read_value=False))
+
+            with tf.control_dependencies(control_inputs=(dependencies + assertions)):
+                actions = actions.fmap(function=tf_util.identity)
+                timestep = tf_util.identity(input=self.timesteps)
+                return actions, timestep
+
+    @tf_function(num_args=3, api_function=True)
+    def observe(self, *, terminal, reward, parallel):
+        zero = tf_util.constant(value=0, dtype='int')
+        one = tf_util.constant(value=1, dtype='int')
+        batch_size = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int')
+        expanded_parallel = tf.expand_dims(input=tf.expand_dims(input=parallel, axis=0), axis=1)
+        is_terminal = tf.math.greater(x=terminal[-1], y=zero)
+
+        # Input assertions
+        assertions = list()
+        if self.config.create_tf_assertions:
+            assertions.extend(self.terminal_spec.tf_assert(
+                x=terminal, batch_size=batch_size,
+                message='Agent.observe: invalid {issue} for terminal input.'
+            ))
+            assertions.extend(self.reward_spec.tf_assert(
+                x=reward, batch_size=batch_size,
+                message='Agent.observe: invalid {issue} for reward input.'
+            ))
+            assertions.extend(self.parallel_spec.tf_assert(
+                x=parallel, message='Agent.observe: invalid {issue} for parallel input.'
+ )) + # Assertion: at most one terminal + num_terms = tf.math.count_nonzero(input=terminal, dtype=tf_util.get_dtype(type='int')) + assertions.append(tf.debugging.assert_less_equal( + x=num_terms, y=one, message="Agent.observe: input contains more than one terminal." + )) + # Assertion: if terminal, last timestep in batch + assertions.append(tf.debugging.assert_equal( + x=tf.math.greater(x=num_terms, y=zero), y=is_terminal, + message="Agent.observe: terminal is not the last input timestep." + )) + + with tf.control_dependencies(control_inputs=assertions): + dependencies = list() + + # Reward summary + if self.summaries == 'all' or 'reward' in self.summaries: + with self.summarizer.as_default(): + x = tf.math.reduce_mean(input_tensor=reward) + dependencies.append( + tf.summary.scalar(name='reward', data=x, step=self.timesteps) + ) + + # Update episode length/reward + updates = tf.expand_dims(input=batch_size, axis=0) + value = tf.tensor_scatter_nd_add( + tensor=self.episode_length, indices=expanded_parallel, updates=updates + ) + dependencies.append(self.episode_length.assign(value=value)) + # sparse_delta = tf.IndexedSlices(values=batch_size, indices=parallel) + # dependencies.append(self.episode_length.scatter_add(sparse_delta=sparse_delta)) + sum_reward = tf.math.reduce_sum(input_tensor=reward, keepdims=True) + value = tf.tensor_scatter_nd_add( + tensor=self.episode_return, indices=expanded_parallel, updates=sum_reward + ) + dependencies.append(self.episode_return.assign(value=value)) + # sum_reward = tf.math.reduce_sum(input_tensor=reward) + # sparse_delta = tf.IndexedSlices(values=sum_reward, indices=parallel) + # dependencies.append(self.episode_return.scatter_add(sparse_delta=sparse_delta)) + + # Core observe (before terminal handling) + updated = self.core_observe(terminal=terminal, reward=reward, parallel=parallel) + dependencies.append(updated) + + # Handle terminal (after core observe and episode reward) + with tf.control_dependencies(control_inputs=dependencies): + + def fn_terminal(): + operations = list() + + # Reset internals + def function(spec, initial): + return tf_util.constant(value=initial, dtype=spec.type) + + initials = self.internals_spec.fmap( + function=function, cls=TensorDict, zip_values=self.initial_internals + ) + for name, previous, initial in self.previous_internals.zip_items(initials): + updates = tf.expand_dims(input=initial, axis=0) + value = tf.tensor_scatter_nd_update( + tensor=previous, indices=expanded_parallel, updates=updates + ) + operations.append(previous.assign(value=value)) + # sparse_delta = tf.IndexedSlices(values=initial, indices=parallel) + # operations.append(previous.scatter_update(sparse_delta=sparse_delta)) + + # Episode length/reward summaries (before episode reward reset / episodes increment) + dependencies = list() + if self.summaries == 'all' or 'reward' in self.summaries: + with self.summarizer.as_default(): + x = tf.gather(params=self.episode_length, indices=parallel) + dependencies.append( + tf.summary.scalar(name='episode-length', data=x, step=self.episodes) + ) + x = tf.gather(params=self.episode_return, indices=parallel) + dependencies.append( + tf.summary.scalar(name='episode-return', data=x, step=self.episodes) + ) + + # Reset episode length/reward + with tf.control_dependencies(control_inputs=dependencies): + zeros = tf_util.zeros(shape=(1,), dtype='int') + value = tf.tensor_scatter_nd_update( + tensor=self.episode_length, indices=expanded_parallel, updates=zeros + ) + 
operations.append(self.episode_length.assign(value=value)) + # sparse_delta = tf.IndexedSlices(values=zero, indices=parallel) + # operations.append(self.episode_length.scatter_update(sparse_delta=sparse_delta)) + zeros = tf_util.zeros(shape=(1,), dtype='float') + value = tf.tensor_scatter_nd_update( + tensor=self.episode_return, indices=expanded_parallel, updates=zeros + ) + operations.append(self.episode_return.assign(value=value)) + # zero_float = tf_util.constant(value=0.0, dtype='float') + # sparse_delta = tf.IndexedSlices(values=zero_float, indices=parallel) + # operations.append(self.episode_return.scatter_update(sparse_delta=sparse_delta)) + + # Increment episodes counter + operations.append(self.episodes.assign_add(delta=one, read_value=False)) + + return tf.group(*operations) + + handle_terminal = tf.cond(pred=is_terminal, true_fn=fn_terminal, false_fn=tf.no_op) + + with tf.control_dependencies(control_inputs=(handle_terminal,)): + episodes = tf_util.identity(input=self.episodes) + updates = tf_util.identity(input=self.updates) + return updated, episodes, updates + + @tf_function(num_args=5) + def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent): + raise NotImplementedError + + @tf_function(num_args=3) + def core_observe(self, *, terminal, reward, parallel): + return tf_util.constant(value=False, dtype='bool') + + def save(self, *, directory=None, filename=None, format='checkpoint', append=None): + if directory is None and filename is None and format == 'checkpoint': + if self.saver is None: + raise TensorforceError.required(name='Model.save', argument='directory') + if append is None: + append = self.saver._step_counter + else: + append = self.units[append] + return self.saver.save(checkpoint_number=append) + + if directory is None: + raise TensorforceError.required(name='Model.save', argument='directory') + + if filename is None: + filename = self.name + + if append is not None: + filename = filename + '-' + str(self.units[append].numpy().item()) + + if format == 'saved-model': + if filename != self.name: + directory = os.path.join(directory, filename) + + assert hasattr(self, '_independent_act_graphs') + assert len(self._independent_act_graphs) == 1 + independent_act = next(iter(self._independent_act_graphs.values())) + + trackables = self.get_savedmodel_trackables() + assert 'act' not in trackables and 'initial_internals' not in trackables + trackables = OrderedDict(sorted(trackables.items(), key=(lambda kv: kv[0]))) + + @tf.function(input_signature=(), autograph=False) + def initial_internals(): + return self.internals_spec.fmap(function=( + lambda spec, internal: tf.constant(value=internal) + ), cls=dict, zip_values=self.initial_internals) + + checkpoint = tf.train.Checkpoint( + act=independent_act, initial_internals=initial_internals, **trackables + ) + + # TensorFlow restriction: "Dictionaries outputs for functions used as signatures should + # have one Tensor output per string key." + if len(self.internals_spec) == 0 and \ + not any(name is not None and '/' in name for name in self.actions_spec): + signatures = independent_act.get_concrete_function( + *self.input_signature(function='independent_act').to_list(to_dict=True) + ) + else: + signatures = None + + return tf.saved_model.save(obj=checkpoint, export_dir=directory, signatures=signatures) + + if format == 'checkpoint': + # which variables are not saved? should all be saved probably, so remove option + # always write temporary terminal=2/3 to indicate it is in process... 
has been removed recently... + # check everywhere temrinal is checked that this is correct, if 3 is used. + # Reset should reset estimator!!! + if self.checkpoint is None: + self.checkpoint = tf.train.Checkpoint(**{self.name: self}) + + # We are using the high-level "save" method of the checkpoint to write a "checkpoint" file. + # This makes it easily restorable later on. + # The base class uses the lower level "write" method, which doesn't provide such niceties. + return self.checkpoint.save(file_prefix=os.path.join(directory, filename)) + + # elif format == 'tensorflow': + # if self.summarizer_spec is not None: + # self.monitored_session.run(fetches=self.summarizer_flush) + # saver_path = self.saver.save( + # sess=self.session, save_path=path, global_step=append, + # # latest_filename=None, # Defaults to 'checkpoint'. + # meta_graph_suffix='meta', write_meta_graph=True, write_state=True + # ) + # assert saver_path.startswith(path) + # path = saver_path + + # if not no_act_pb: + # graph_def = self.graph.as_graph_def() + + # # freeze_graph clear_devices option + # for node in graph_def.node: + # node.device = '' + + # graph_def = tf.compat.v1.graph_util.remove_training_nodes(input_graph=graph_def) + # output_node_names = [ + # self.name + '.independent_act/' + name + '-output' + # for name in self.output_tensors['independent_act'] + # ] + # # implies tf.compat.v1.graph_util.extract_sub_graph + # graph_def = tf.compat.v1.graph_util.convert_variables_to_constants( + # sess=self.monitored_session, input_graph_def=graph_def, + # output_node_names=output_node_names + # ) + # graph_path = tf.io.write_graph( + # graph_or_graph_def=graph_def, logdir=directory, + # name=(os.path.split(path)[1] + '.pb'), as_text=False + # ) + # assert graph_path == path + '.pb' + # return path + + elif format == 'numpy': + variables = dict() + for variable in self.saved_variables: + assert variable.name[-2] == ':' + if variable.name.startswith(self.name + '/'): + variables[variable.name[len(self.name) + 1: -2]] = variable.numpy() + else: + variables[variable.name[:-2]] = variable.numpy() + path = os.path.join(directory, filename) + '.npz' + np.savez(file=path, **variables) + return path + + elif format == 'hdf5': + path = os.path.join(directory, filename) + '.hdf5' + with h5py.File(name=path, mode='w') as filehandle: + for variable in self.saved_variables: + assert variable.name[-2] == ':' + if variable.name.startswith(self.name + '/'): + filehandle.create_dataset( + name=variable.name[len(self.name) + 1: -2], data=variable.numpy() + ) + else: + filehandle.create_dataset(name=variable.name[:-2], data=variable.numpy()) + return path + + else: + raise TensorforceError.value(name='Model.save', argument='format', value=format) + + def restore(self, *, directory=None, filename=None, format='checkpoint'): + if format == 'checkpoint': + if directory is None: + if self.saver is None: + raise TensorforceError.required(name='Model.save', argument='directory') + directory = self.saver_directory + if filename is None: + path = tf.train.latest_checkpoint(checkpoint_dir=directory) + if not path: + raise TensorforceError.exists_not(name='Checkpoint', value=directory) + _directory, filename = os.path.split(path) + assert _directory == directory + super().restore(directory=directory, filename=filename) + + elif format == 'saved-model': + # TODO: Check memory/estimator/etc variables are not included! 
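+            # Note: a 'saved-model' export (see Model.save with format='saved-model') only
+            # bundles the act function(s) and associated trackables, presumably not the full
+            # training state, which is why restoring from it is not supported here.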
+ raise TensorforceError.value(name='Model.load', argument='format', value=format) + + # elif format == 'tensorflow': + # self.saver.restore(sess=self.session, save_path=path) + + elif format == 'numpy': + if directory is None: + raise TensorforceError( + name='Model.load', argument='directory', condition='format is "numpy"' + ) + if filename is None: + raise TensorforceError( + name='Model.load', argument='filename', condition='format is "numpy"' + ) + variables = np.load(file=(os.path.join(directory, filename) + '.npz')) + for variable in self.saved_variables: + assert variable.name[-2] == ':' + if variable.name.startswith(self.name + '/'): + variable.assign(value=variables[variable.name[len(self.name) + 1: -2]]) + else: + variable.assign(value=variables[variable.name[:-2]]) + + elif format == 'hdf5': + if directory is None: + raise TensorforceError( + name='Model.load', argument='directory', condition='format is "hdf5"' + ) + if filename is None: + raise TensorforceError( + name='Model.load', argument='filename', condition='format is "hdf5"' + ) + path = os.path.join(directory, filename) + if os.path.isfile(path + '.hdf5'): + path = path + '.hdf5' + else: + path = path + '.h5' + with h5py.File(name=path, mode='r') as filehandle: + for variable in self.saved_variables: + assert variable.name[-2] == ':' + if variable.name.startswith(self.name + '/'): + variable.assign(value=filehandle[variable.name[len(self.name) + 1: -2]]) + else: + variable.assign(value=filehandle[variable.name[:-2]]) + + else: + raise TensorforceError.value(name='Model.load', argument='format', value=format) + + timesteps, episodes, updates = self.reset() + return timesteps.numpy().item(), episodes.numpy().item(), updates.numpy().item() diff --git a/tensorforce/core/models/random.py b/tensorforce/core/models/random.py new file mode 100644 index 000000000..5479b4844 --- /dev/null +++ b/tensorforce/core/models/random.py @@ -0,0 +1,96 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import TensorDict, tf_function, tf_util +from tensorforce.core.models import Model + + +class RandomModel(Model): + """ + Utility class to return random actions of a desired shape and with given bounds. 
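+
+    Bool actions are sampled uniformly from {True, False}, masked int actions uniformly
+    from the currently valid actions, float actions with both bounds uniformly from
+    [min_value, max_value], and otherwise unbounded actions from a standard normal
+    distribution (see core_act below).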
+ """ + + def __init__(self, *, states, actions, parallel_interactions, config, summarizer, tracking): + super().__init__( + states=states, actions=actions, l2_regularization=0.0, + parallel_interactions=parallel_interactions, config=config, saver=None, + summarizer=summarizer, tracking=tracking + ) + + @tf_function(num_args=5) + def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent): + assert len(internals) == 0 + + actions = TensorDict() + x = tf.shape(input=states.value())[:1] + for name, spec in self.actions_spec.items(): + shape = tf.concat(values=( + tf_util.cast(x=x, dtype='int'), + tf_util.constant(value=spec.shape, dtype='int') + ), axis=0) + + if spec.type == 'bool': + # Random bool action: uniform[True, False] + half = tf_util.constant(value=0.5, dtype='float') + uniform = tf.random.uniform(shape=shape, dtype=tf_util.get_dtype(type='float')) + actions[name] = (uniform < half) + + elif self.config.enable_int_action_masking and spec.type == 'int' and \ + spec.num_values is not None: + # Random masked action: uniform[unmasked] + # (Similar code as for Model.apply_exploration) + mask = auxiliaries[name]['mask'] + choices = tf_util.constant( + value=list(range(spec.num_values)), dtype=spec.type, + shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values)) + ) + one = tf_util.constant(value=1, dtype='int', shape=(1,)) + multiples = tf.concat(values=(shape, one), axis=0) + choices = tf.tile(input=choices, multiples=multiples) + choices = tf.boolean_mask(tensor=choices, mask=mask) + mask = tf_util.cast(x=mask, dtype='int') + num_valid = tf.math.reduce_sum(input_tensor=mask, axis=(spec.rank + 1)) + num_valid = tf.reshape(tensor=num_valid, shape=(-1,)) + masked_offset = tf.math.cumsum(x=num_valid, axis=0, exclusive=True) + uniform = tf.random.uniform(shape=shape, dtype=tf_util.get_dtype(type='float')) + uniform = tf.reshape(tensor=uniform, shape=(-1,)) + num_valid = tf_util.cast(x=num_valid, dtype='float') + random_offset = tf.dtypes.cast(x=(uniform * num_valid), dtype=tf.dtypes.int64) + action = tf.gather(params=choices, indices=(masked_offset + random_offset)) + actions[name] = tf.reshape(tensor=action, shape=shape) + + elif spec.type != 'bool' and spec.min_value is not None: + if spec.max_value is not None: + # Random bounded action: uniform[min_value, max_value] + actions[name] = tf.random.uniform( + shape=shape, minval=spec.min_value, maxval=spec.max_value, + dtype=spec.tf_type() + ) + + else: + # Random left-bounded action: not implemented + raise NotImplementedError + + elif spec.type != 'bool' and spec.max_value is not None: + # Random right-bounded action: not implemented + raise NotImplementedError + + else: + # Random unbounded int/float action + actions[name] = tf.random.normal(shape=shape, dtype=spec.tf_type()) + + return actions, TensorDict() diff --git a/tensorforce/core/models/tensorforce.py b/tensorforce/core/models/tensorforce.py new file mode 100644 index 000000000..3be50029e --- /dev/null +++ b/tensorforce/core/models/tensorforce.py @@ -0,0 +1,2963 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import logging + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import ModuleDict, memory_modules, optimizer_modules, parameter_modules, \ + SignatureDict, TensorDict, TensorSpec, TensorsSpec, tf_function, tf_util, VariableDict +from tensorforce.core.models import Model +from tensorforce.core.networks import Preprocessor +from tensorforce.core.objectives import objective_modules +from tensorforce.core.policies import policy_modules, StochasticPolicy + + +class TensorforceModel(Model): + + def __init__( + self, *, + states, actions, max_episode_timesteps, + policy, memory, update, optimizer, objective, reward_estimation, + baseline, baseline_optimizer, baseline_objective, + l2_regularization, entropy_regularization, + state_preprocessing, + exploration, variable_noise, + parallel_interactions, + config, saver, summarizer, tracking + ): + super().__init__( + states=states, actions=actions, l2_regularization=l2_regularization, + parallel_interactions=parallel_interactions, config=config, saver=saver, + summarizer=summarizer, tracking=tracking + ) + + if max_episode_timesteps is None: + self.max_episode_timesteps = None + else: + self.max_episode_timesteps = int(max_episode_timesteps) + + # State preprocessing + self.processed_states_spec = TensorsSpec() + self.state_preprocessing = ModuleDict() + if state_preprocessing == 'linear_normalization': + # Default handling, otherwise layer will be applied to all input types + state_preprocessing = { + name: ['linear_normalization'] for name, spec in self.states_spec.items() + if spec.type == 'float' and spec.min_value is not None and + spec.max_value is not None + } + if not isinstance(state_preprocessing, dict) or \ + any(name not in self.states_spec for name in state_preprocessing): + state_preprocessing = {name: state_preprocessing for name in self.states_spec} + + for name, spec in self.states_spec.items(): + if name in state_preprocessing: + layers = state_preprocessing[name] + elif spec.type in state_preprocessing: + layers = state_preprocessing[spec.type] + else: + layers = None + + if layers is None: + self.processed_states_spec[name] = self.states_spec[name] + else: + if name is None: + module_name = 'state_preprocessing' + else: + module_name = name + '_preprocessing' + self.state_preprocessing[name] = self.submodule( + name=module_name, module=Preprocessor, is_trainable=False, input_spec=spec, + layers=layers + ) + spec = self.state_preprocessing[name].output_spec() + self.processed_states_spec[name] = spec + + if spec.type == 'float' and spec.min_value is not None and \ + spec.max_value is not None: + if isinstance(spec.min_value, float): + if not (-10.0 <= spec.min_value < 0.0) or not (0.0 < spec.max_value <= 10.0): + logging.warning("{}tate{} does not seem to be normalized, consider " + "adding linear_normalization preprocessing.".format( + 'S' if layers is None else 'Preprocessed s', + '' if name is None else ' ' + name + )) + else: + # TODO: missing +/-10.0 check, but cases of values +/-inf are 
already covered by + # previous no-bound warning + if (spec.min_value >= 0.0).any() or (spec.max_value <= 0.0).any(): + logging.warning("{}tate{} does not seem to be normalized, consider " + "adding linear_normalization preprocessing.".format( + 'S' if layers is None else 'Preprocessed s', + '' if name is None else ' ' + name + )) + + # Action exploration + if exploration is None: + exploration = 0.0 + if isinstance(exploration, dict) and all(name in self.actions_spec for name in exploration): + # Different exploration per action + self.exploration = ModuleDict() + for name, spec in self.actions_spec.items(): + if name in exploration: + module = exploration[name] + elif spec.type in exploration: + module = exploration[spec.type] + else: + module = None + if module is None: + pass + elif spec.type in ('bool', 'int'): + self.exploration[name] = self.submodule( + name=(name + '_exploration'), module=module, modules=parameter_modules, + is_trainable=False, dtype='float', min_value=0.0, max_value=1.0 + ) + else: + self.exploration[name] = self.submodule( + name=(name + '_exploration'), module=module, modules=parameter_modules, + is_trainable=False, dtype='float', min_value=0.0 + ) + else: + # Same exploration for all actions + self.exploration = self.submodule( + name='exploration', module=exploration, modules=parameter_modules, + is_trainable=False, dtype='float', min_value=0.0 + ) + + # Variable noise + if variable_noise is None: + variable_noise = 0.0 + self.variable_noise = self.submodule( + name='variable_noise', module=variable_noise, modules=parameter_modules, + is_trainable=False, dtype='float', min_value=0.0 + ) + + # Reward estimation argument check + if not all(key in ( + 'advantage_processing', 'discount', 'estimate_advantage', 'gae_decay', 'horizon', + 'predict_action_values', 'predict_horizon_values', 'predict_terminal_values', + 'return_processing', 'reward_processing', 'trace_decay' + ) for key in reward_estimation): + raise TensorforceError.value( + name='agent', argument='reward_estimation', value=reward_estimation, + hint='not from {advantage_processing,discount,estimate_advantage,gae_decay,' + 'horizon,predict_action_values,predict_horizon_values,predict_terminal_values,' + 'return_processing,reward_processing,trace_decay}' + ) + + # Reward estimation + self.estimate_advantage = reward_estimation.get('estimate_advantage', False) + self.predict_horizon_values = reward_estimation.get('predict_horizon_values') + self.predict_action_values = reward_estimation.get('predict_action_values', False) + self.predict_terminal_values = reward_estimation.get('predict_terminal_values', False) + + # Return horizon + if reward_estimation['horizon'] == 'episode': + self.reward_horizon = 'episode' + if self.predict_horizon_values is None: + self.predict_horizon_values = 'early' + elif self.predict_horizon_values == 'late': + raise TensorforceError.value( + name='agent', argument='reward_estimation[predict_horizon_values]', + value=self.predict_horizon_values, + condition='reward_estimation[reward_horizon] is "episode"' + ) + else: + self.reward_horizon = self.submodule( + name='reward_horizon', module=reward_estimation['horizon'], + modules=parameter_modules, dtype='int', min_value=1, + max_value=self.max_episode_timesteps + ) + if self.predict_horizon_values is None: + self.predict_horizon_values = 'late' + + # Reward discount + reward_discount = reward_estimation.get('discount') + if reward_discount is None: + reward_discount = 1.0 + self.reward_discount = self.submodule( + 
name='reward_discount', module=reward_discount, modules=parameter_modules, + dtype='float', min_value=0.0, max_value=1.0 + ) + + # Entropy regularization + if entropy_regularization is None: + entropy_regularization = 0.0 + self.entropy_regularization = self.submodule( + name='entropy_regularization', module=entropy_regularization, + modules=parameter_modules, is_trainable=False, dtype='float', min_value=0.0 + ) + + # Update mode + if not all(key in ('batch_size', 'frequency', 'start', 'unit') for key in update): + raise TensorforceError.value( + name='agent', argument='update', value=list(update), + hint='not from {batch_size,frequency,start,unit}' + ) + # update: unit + elif 'unit' not in update: + raise TensorforceError.required(name='agent', argument='update[unit]') + elif update['unit'] not in ('timesteps', 'episodes'): + raise TensorforceError.value( + name='agent', argument='update[unit]', value=update['unit'], + hint='not in {timesteps,episodes}' + ) + # update: batch_size + elif 'batch_size' not in update: + raise TensorforceError.required(name='agent', argument='update[batch_size]') + + self.update_unit = update['unit'] + self.update_batch_size = self.submodule( + name='update_batch_size', module=update['batch_size'], modules=parameter_modules, + is_trainable=False, dtype='int', min_value=1 + ) + if 'frequency' in update and update['frequency'] == 'never': + self.update_frequency = None + else: + frequency = update.get('frequency') + if frequency is None: + frequency = update['batch_size'] + elif isinstance(frequency, float): + if frequency <= 0.0 or frequency > 1.0: + raise TensorforceError.value( + name='agent', argument='update[frequency]', value=update['frequency'], + hint='not in (0.0, 1.0]' + ) + else: + frequency = max(1, int(frequency * update['batch_size'])) + self.update_frequency = self.submodule( + name='update_frequency', module=frequency, modules=parameter_modules, + is_trainable=False, dtype='int', min_value=1, + max_value=max(2, self.update_batch_size.max_value()) + ) + start = update.get('start') + if start is None: + start = 0 + self.update_start = self.submodule( + name='update_start', module=start, modules=parameter_modules, is_trainable=False, + dtype='int', min_value=0 + ) + + # Baseline optimization overview: + # Policy Objective Optimizer Config + # n n n default predict_horizon_values=False + # n n f default predict_horizon=False + # n n y default predict_horizon=False + # n y n main policy, shared loss/kldiv, weighted 1.0 + # n y f main policy, shared loss/kldiv, weighted + # n y y main policy, separate + # y n n estimate_advantage=True,advantage_in_loss=True + # y n f shared objective/loss/kldiv, weighted + # y n y shared objective + # y y n shared loss/kldiv, weighted 1.0, equal horizon + # y y f shared loss/kldiv, weighted, equal horizon + # y y y separate + + self.separate_baseline = (baseline is not None) + + if baseline is None and baseline_objective is None and \ + 'predict_horizon_values' not in reward_estimation: + self.predict_horizon_values = False + + if baseline is not None and baseline_objective is None and \ + baseline_optimizer is None: + if 'estimate_advantage' not in reward_estimation: + self.estimate_advantage = 'late' + self.advantage_in_loss = True + else: + self.advantage_in_loss = False + + if baseline_optimizer is None and baseline_objective is not None: + baseline_optimizer = 1.0 + + if baseline_optimizer is None or isinstance(baseline_optimizer, float): + baseline_is_trainable = True + else: + baseline_is_trainable = False + + # 
Reward processing + reward_processing = reward_estimation.get('reward_processing') + if reward_processing is None: + self.reward_processing = None + else: + self.reward_processing = self.submodule( + name='reward_processing', module=Preprocessor, is_trainable=False, + input_spec=self.reward_spec, layers=reward_processing + ) + if self.reward_processing.output_spec() != self.reward_spec: + raise TensorforceError.mismatch( + name='reward_estimation[reward_processing]', argument='output spec', + value1=self.reward_processing.output_spec(), value2=self.reward_spec + ) + + # Return processing + return_processing = reward_estimation.get('return_processing') + if return_processing is None: + self.return_processing = None + else: + self.return_processing = self.submodule( + name='return_processing', module=Preprocessor, is_trainable=False, + input_spec=self.reward_spec, layers=return_processing, + is_preprocessing_layer_valid=False + ) + if self.return_processing.output_spec() != self.reward_spec: + raise TensorforceError.mismatch( + name='reward_estimation[return_processing]', argument='output spec', + value1=self.return_processing.output_spec(), value2=self.reward_spec + ) + + # Advantage processing + advantage_processing = reward_estimation.get('advantage_processing') + if advantage_processing is None: + self.advantage_processing = None + else: + if self.estimate_advantage is False: + raise TensorforceError.invalid( + name='agent', argument='reward_estimation[advantage_processing]', + condition='reward_estimation[estimate_advantage] is false' + ) + self.advantage_processing = self.submodule( + name='advantage_processing', module=Preprocessor, is_trainable=False, + input_spec=self.reward_spec, layers=advantage_processing, + is_preprocessing_layer_valid=False + ) + if self.advantage_processing.output_spec() != self.reward_spec: + raise TensorforceError.mismatch( + name='reward_estimation[advantage_processing]', argument='output spec', + value1=self.advantage_processing.output_spec(), value2=self.reward_spec + ) + + # Objectives + self.objective = self.submodule( + name='policy_objective', module=objective, modules=objective_modules, + states_spec=self.processed_states_spec, auxiliaries_spec=self.auxiliaries_spec, + actions_spec=self.actions_spec, reward_spec=self.reward_spec + ) + if baseline_objective is None: + self.baseline_objective = None + else: + self.baseline_objective = self.submodule( + name='baseline_objective', module=baseline_objective, modules=objective_modules, + is_trainable=baseline_is_trainable, states_spec=self.processed_states_spec, + auxiliaries_spec=self.auxiliaries_spec, actions_spec=self.actions_spec, + reward_spec=self.reward_spec + ) + assert len(self.baseline_objective.required_baseline_fns()) == 0 + + # Policy + required_fns = {'policy'} + required_fns.update(self.objective.required_policy_fns()) + if not self.separate_baseline: + if self.predict_horizon_values is not False or self.estimate_advantage is not False: + if self.predict_action_values: + required_fns.add('action_value') + else: + required_fns.add('state_value') + required_fns.update(self.objective.required_baseline_fns()) + if self.baseline_objective is not None: + required_fns.update(self.baseline_objective.required_policy_fns()) + + if required_fns <= {'state_value'}: + default_module = 'parametrized_state_value' + elif required_fns <= {'action_value'} and \ + all(spec.type == 'float' for spec in self.actions_spec.values()): + default_module = 'parametrized_action_value' + elif required_fns <= {'policy', 
'action_value', 'state_value'} and \ + all(spec.type in ('bool', 'int') for spec in self.actions_spec.values()): + default_module = 'parametrized_value_policy' + elif required_fns <= {'policy', 'stochastic'}: + default_module = 'parametrized_distributions' + else: + logging.warning( + "Policy type should be explicitly specified for non-standard agent configuration." + ) + default_module = 'parametrized_distributions' + + self.policy = self.submodule( + name='policy', module=policy, modules=policy_modules, default_module=default_module, + states_spec=self.processed_states_spec, auxiliaries_spec=self.auxiliaries_spec, + actions_spec=self.actions_spec + ) + self.internals_spec['policy'] = self.policy.internals_spec + self.initial_internals['policy'] = self.policy.internals_init() + self.objective.internals_spec = self.policy.internals_spec + + if not self.entropy_regularization.is_constant(value=0.0) and \ + not isinstance(self.policy, StochasticPolicy): + raise TensorforceError.invalid( + name='agent', argument='entropy_regularization', + condition='policy is not stochastic' + ) + + # Baseline + if self.separate_baseline: + if self.predict_horizon_values is not False or self.estimate_advantage is not False: + if self.predict_action_values: + required_fns = {'action_value'} + else: + required_fns = {'state_value'} + required_fns.update(self.objective.required_baseline_fns()) + if self.baseline_objective is not None: + required_fns.update(self.baseline_objective.required_policy_fns()) + + if required_fns <= {'state_value'}: + default_module = 'parametrized_state_value' + elif required_fns <= {'action_value'} and \ + all(spec.type == 'float' for spec in self.actions_spec.values()): + default_module = 'parametrized_action_value' + elif required_fns <= {'policy', 'action_value', 'state_value'} and \ + all(spec.type in ('bool', 'int') for spec in self.actions_spec.values()): + default_module = 'parametrized_value_policy' + elif required_fns <= {'policy', 'stochastic'}: + default_module = 'parametrized_distributions' + else: + logging.warning("Policy type should be explicitly specified for non-standard agent " + "configuration.") + default_module = 'parametrized_distributions' + + self.baseline = self.submodule( + name='baseline', module=baseline, modules=policy_modules, + default_module=default_module, is_trainable=baseline_is_trainable, + states_spec=self.processed_states_spec, auxiliaries_spec=self.auxiliaries_spec, + actions_spec=self.actions_spec + ) + self.internals_spec['baseline'] = self.baseline.internals_spec + self.initial_internals['baseline'] = self.baseline.internals_init() + + else: + self.baseline = self.policy + + if self.baseline_objective is not None: + self.baseline_objective.internals_spec = self.baseline.internals_spec + + # Check for name collisions + for name in self.internals_spec: + if name in self.value_names: + raise TensorforceError.exists(name='value name', value=name) + self.value_names.add(name) + + # Optimizers + if baseline_optimizer is None: + self.baseline_loss_weight = None + internals_spec = self.internals_spec + self.baseline_optimizer = None + elif isinstance(baseline_optimizer, float): + self.baseline_loss_weight = self.submodule( + name='baseline_loss_weight', module=baseline_optimizer, modules=parameter_modules, + is_trainable=False, dtype='float', min_value=0.0 + ) + internals_spec = self.internals_spec + self.baseline_optimizer = None + else: + self.baseline_loss_weight = None + internals_spec = self.internals_spec['policy'] + if self.separate_baseline: 
+ baseline_internals = self.internals_spec['baseline'] + else: + baseline_internals = self.internals_spec['policy'] + arguments_spec = TensorsSpec( + states=self.processed_states_spec, horizons=TensorSpec(type='int', shape=(2,)), + internals=baseline_internals, auxiliaries=self.auxiliaries_spec, + actions=self.actions_spec, reward=self.reward_spec + ) + if self.baseline_objective is not None: + arguments_spec['reference'] = self.baseline_objective.reference_spec() + self.baseline_optimizer = self.submodule( + name='baseline_optimizer', module=baseline_optimizer, modules=optimizer_modules, + is_trainable=False, arguments_spec=arguments_spec + ) + arguments_spec = TensorsSpec( + states=self.processed_states_spec, horizons=TensorSpec(type='int', shape=(2,)), + internals=internals_spec, auxiliaries=self.auxiliaries_spec, actions=self.actions_spec, + reward=self.reward_spec + ) + if self.baseline_objective is not None and self.baseline_loss_weight is not None and \ + not self.baseline_loss_weight.is_constant(value=0.0): + arguments_spec['reference'] = TensorsSpec( + policy=self.objective.reference_spec(), + baseline=self.baseline_objective.reference_spec() + ) + else: + arguments_spec['reference'] = self.objective.reference_spec() + self.optimizer = self.submodule( + name='policy_optimizer', module=optimizer, modules=optimizer_modules, + arguments_spec=arguments_spec + ) + + # Memory + values_spec = TensorsSpec( + states=self.processed_states_spec, internals=self.internals_spec, + auxiliaries=self.auxiliaries_spec, actions=self.actions_spec, + terminal=self.terminal_spec, reward=self.reward_spec + ) + if self.update_unit == 'timesteps': + max_past_horizon = max( + self.policy.max_past_horizon(on_policy=False), + self.baseline.max_past_horizon(on_policy=False) + ) + min_capacity = self.update_batch_size.max_value() + 1 + max_past_horizon + if self.reward_horizon == 'episode': + min_capacity += self.max_episode_timesteps + else: + min_capacity += self.reward_horizon.max_value() + if self.max_episode_timesteps is not None: + min_capacity = max(min_capacity, self.max_episode_timesteps) + elif self.update_unit == 'episodes': + if self.max_episode_timesteps is None: + min_capacity = None + else: + min_capacity = (self.update_batch_size.max_value() + 1) * self.max_episode_timesteps + else: + assert False + if self.config.buffer_observe == 'episode': + if self.max_episode_timesteps is not None: + min_capacity = max(min_capacity, 2 * self.max_episode_timesteps) + elif isinstance(self.config.buffer_observe, int): + if min_capacity is None: + min_capacity = 2 * self.config.buffer_observe + else: + min_capacity = max(min_capacity, 2 * self.config.buffer_observe) + + self.memory = self.submodule( + name='memory', module=memory, modules=memory_modules, is_trainable=False, + values_spec=values_spec, min_capacity=min_capacity + ) + + # Trace decay + trace_decay = reward_estimation.get('trace_decay', 1.0) + if trace_decay != 1.0 and self.predict_horizon_values != 'early': + raise TensorforceError.invalid( + name='agent', argument='reward_estimation[trace_decay]', + condition='reward_estimation[predict_horizon_values] != "early"' + ) + self.trace_decay = self.submodule( + name='trace_decay', module=trace_decay, modules=parameter_modules, dtype='float', + min_value=0.0, max_value=1.0 + ) + + # GAE decay + gae_decay = reward_estimation.get('gae_decay', 0.0) + if gae_decay != 0.0: + from tensorforce.core.memories import Recent + if not isinstance(self.memory, Recent): + raise TensorforceError.invalid( + 
name='agent', argument='reward_estimation[gae_decay]', + condition='memory type is not Recent' + ) + elif self.estimate_advantage is False: + raise TensorforceError.invalid( + name='agent', argument='reward_estimation[gae_decay]', + condition='reward_estimation[estimate_advantage] is false' + ) + elif self.advantage_in_loss: + raise TensorforceError.invalid( + name='agent', argument='reward_estimation[gae_decay]', + condition='advantage-in-loss mode' + ) + self.gae_decay = self.submodule( + name='gae_decay', module=gae_decay, modules=parameter_modules, dtype='float', + min_value=0.0, max_value=1.0 + ) + + def get_architecture(self): + if self.state_preprocessing.is_singleton(): + architecture = 'State-preprocessing: {}\n'.format( + self.state_preprocessing.singleton().get_architecture().replace('\n', '\n ') + ) + else: + architecture = '' + for name, preprocessor in self.state_preprocessing.items(): + architecture += ' {}: {}\n'.format( + name, preprocessor.get_architecture().replace('\n', '\n ') + ) + if len(architecture) > 0: + architecture = 'State-preprocessing:\n' + architecture + architecture += 'Policy:\n {}'.format( + self.policy.get_architecture().replace('\n', '\n ') + ) + if self.separate_baseline: + architecture += '\nBaseline:\n {}'.format( + self.baseline.get_architecture().replace('\n', '\n ') + ) + elif self.predict_horizon_values or self.baseline_objective is not None: + architecture += '\nBaseline: policy used as baseline' + return architecture + + def initialize(self): + super().initialize() + + # Initial variables summaries + if self.summaries == 'all' or 'variables' in self.summaries: + with self.summarizer.as_default(): + for variable in self.trainable_variables: + name = variable.name + assert name[-2] == ':' + if name.startswith(self.name + '/'): + # Add prefix self.name since otherwise different scope from later summaries + name = self.name + '/variables/' + name[len(self.name) + 1: -2] + else: + name = name[:-2] + x = tf.math.reduce_mean(input_tensor=variable) + tf.summary.scalar(name=name, data=x, step=self.updates) + + def core_initialize(self): + super().core_initialize() + + # Preprocessed episode reward + if self.reward_processing is not None: + self.preprocessed_episode_return = self.variable( + name='preprocessed-episode-return', + spec=TensorSpec(type=self.reward_spec.type, shape=(self.parallel_interactions,)), + initializer='zeros', is_trainable=False, is_saved=False + ) + + # Buffer index + self.buffer_index = self.variable( + name='buffer-index', spec=TensorSpec(type='int', shape=(self.parallel_interactions,)), + initializer='zeros', is_trainable=False, is_saved=False + ) + + if self.reward_horizon == 'episode' or self.parallel_interactions > 1 or \ + self.config.buffer_observe == 'episode': + capacity = self.max_episode_timesteps + else: + capacity = self.config.buffer_observe + self.reward_horizon.max_value() + if self.max_episode_timesteps is not None: + capacity = min(capacity, self.max_episode_timesteps) + + # States/internals/auxiliaries/actions buffers + def function(name, spec): + shape = (self.parallel_interactions, capacity) + spec.shape + return self.variable( + name=(name + '-buffer'), spec=TensorSpec(type=spec.type, shape=shape), + initializer='zeros', is_trainable=False, is_saved=False + ) + + self.states_buffer = self.processed_states_spec.fmap( + function=function, cls=VariableDict, with_names='states' + ) + self.internals_buffer = self.internals_spec.fmap( + function=function, cls=VariableDict, with_names=True + ) +
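# (each buffer has shape (parallel_interactions, capacity) + spec.shape; e.g. a float state of shape (4,) with 2 parallel environments and a hypothetical capacity of 100 is stored as a (2, 100, 4) variable) +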
self.auxiliaries_buffer = self.auxiliaries_spec.fmap( + function=function, cls=VariableDict, with_names='action' + ) + self.actions_buffer = self.actions_spec.fmap( + function=function, cls=VariableDict, with_names='actions' + ) + + # Terminal/reward buffer + if self.config.buffer_observe != 'episode': + self.terminal_buffer = function('terminal', self.terminal_spec) + self.reward_buffer = function('reward', self.reward_spec) + + # Buffer start + if self.reward_horizon != 'episode' and self.parallel_interactions == 1 and \ + self.config.buffer_observe != 'episode': + self.circular_buffer = True + self.buffer_capacity = capacity + self.buffer_start = self.variable( + name='buffer-start', + spec=TensorSpec(type='int', shape=(self.parallel_interactions,)), + initializer='zeros', is_trainable=False, is_saved=False + ) + else: + self.circular_buffer = False + + # Last update + self.last_update = self.variable( + name='last-update', spec=TensorSpec(type='int'), + initializer=-self.update_frequency.max_value(), is_trainable=False, is_saved=True + ) + + # Optimizer initialize given variables + if self.advantage_in_loss: + self.optimizer.initialize_given_variables(variables=self.trainable_variables) + else: + self.optimizer.initialize_given_variables(variables=self.policy.trainable_variables) + if self.baseline_optimizer is not None: + self.baseline_optimizer.initialize_given_variables( + variables=self.baseline.trainable_variables + ) + + # Summaries and tracking + self.register_summary(label='loss', name='losses/policy-objective-loss') + self.register_tracking( + label='loss', name='policy-objective-loss', spec=TensorSpec(type='float') + ) + self.register_summary(label='loss', name='losses/policy-regularization-loss') + self.register_tracking( + label='loss', name='policy-regularization-loss', spec=TensorSpec(type='float') + ) + self.register_summary(label='loss', name='losses/policy-loss') + self.register_tracking(label='loss', name='policy-loss', spec=TensorSpec(type='float')) + if self.baseline_optimizer is not None or ( + self.baseline_loss_weight is not None and + not self.baseline_loss_weight.is_constant(value=0.0) + ): + self.register_summary(label='loss', name='losses/baseline-loss') + self.register_tracking(label='loss', name='baseline-loss', spec=TensorSpec(type='float')) + if self.separate_baseline: + self.register_summary(label='loss', name='losses/baseline-objective-loss') + self.register_tracking( + label='loss', name='baseline-objective-loss', spec=TensorSpec(type='float') + ) + self.register_summary(label='loss', name='losses/baseline-regularization-loss') + self.register_tracking( + label='loss', name='baseline-regularization-loss', + spec=TensorSpec(type='float') + ) + + if self.reward_processing is not None: + self.register_tracking( + label='reward', name='preprocessed-reward', spec=TensorSpec(type='float') + ) + self.register_tracking( + label='reward', name='preprocessed-episode-return', spec=TensorSpec(type='float') + ) + self.register_tracking(label='reward', name='update-return', spec=TensorSpec(type='float')) + if self.return_processing is not None: + self.register_tracking( + label='reward', name='update-processed-return', spec=TensorSpec(type='float') + ) + if self.estimate_advantage is not False: + self.register_tracking( + label='reward', name='update-advantage', spec=TensorSpec(type='float') + ) + if self.advantage_processing is not None: + self.register_tracking( + label='reward', name='update-processed-advantage', + spec=TensorSpec(type='float') + ) + if not 
self.gae_decay.is_constant(value=0.0): + self.register_tracking( + label='reward', name='update-gae', spec=TensorSpec(type='float') + ) + + self.register_tracking(label='entropy', name='entropy', spec=TensorSpec(type='float')) + self.register_tracking( + label='kl-divergence', name='kl-divergence', spec=TensorSpec(type='float') + ) + if len(self.actions_spec) > 1: + for name in self.actions_spec: + self.register_tracking( + label='entropy', name=('entropies/' + name), spec=TensorSpec(type='float') + ) + self.register_tracking( + label='kl-divergence', name=('kl-divergences/' + name), + spec=TensorSpec(type='float') + ) + + def initialize_api(self): + super().initialize_api() + + if 'graph' in self.summaries: + tf.summary.trace_on(graph=True, profiler=False) + self.experience( + states=self.states_spec, internals=self.internals_spec, + auxiliaries=self.auxiliaries_spec, actions=self.actions_spec, + terminal=self.terminal_spec, reward=self.reward_spec, _initialize=True + ) + if 'graph' in self.summaries: + tf.summary.trace_export(name='experience', step=self.timesteps, profiler_outdir=None) + tf.summary.trace_on(graph=True, profiler=False) + self.update(_initialize=True) + if 'graph' in self.summaries: + tf.summary.trace_export(name='update', step=self.timesteps, profiler_outdir=None) + + def get_savedmodel_trackables(self): + trackables = super().get_savedmodel_trackables() + for name, trackable in self.policy.get_savedmodel_trackables().items(): + assert name not in trackables + trackables[name] = trackable + if self.separate_baseline and len(self.internals_spec['baseline']) > 0: + for name, trackable in self.baseline.get_savedmodel_trackables().items(): + assert name not in trackables + trackables[name] = trackable + return trackables + + def input_signature(self, *, function): + if function == 'baseline_loss': + if self.separate_baseline: + internals_signature = self.internals_spec['baseline'].signature(batched=True) + else: + internals_signature = self.internals_spec['policy'].signature(batched=True) + if self.advantage_in_loss: + assert False + elif self.baseline_objective is None: + return SignatureDict( + states=self.processed_states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=internals_signature, + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True), + reference=self.objective.reference_spec().signature(batched=True) + ) + else: + return SignatureDict( + states=self.processed_states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=internals_signature, + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True), + reference=self.baseline_objective.reference_spec().signature(batched=True) + ) + + elif function == 'core_experience': + return SignatureDict( + states=self.processed_states_spec.signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + terminal=self.terminal_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True) + ) + + elif function == 'core_update': + return SignatureDict() + + elif function == 'experience': + return SignatureDict( + 
states=self.states_spec.signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + terminal=self.terminal_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True) + ) + + elif function == 'loss': + if self.baseline_objective is not None and self.baseline_loss_weight is not None and \ + not self.baseline_loss_weight.is_constant(value=0.0): + return SignatureDict( + states=self.processed_states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True), + reference=SignatureDict( + policy=self.objective.reference_spec().signature(batched=True), + baseline=self.baseline_objective.reference_spec().signature(batched=True) + ) + ) + elif self.baseline_optimizer is None: + return SignatureDict( + states=self.processed_states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True), + reference=self.objective.reference_spec().signature(batched=True) + ) + else: + return SignatureDict( + states=self.processed_states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec['policy'].signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True), + reference=self.objective.reference_spec().signature(batched=True) + ) + + elif function == 'regularize': + return SignatureDict( + states=self.processed_states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec['policy'].signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True) + ) + + elif function == 'update': + return SignatureDict() + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'baseline_loss': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + elif function == 'core_experience': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'core_update': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'experience': + return SignatureDict( + timesteps=TensorSpec(type='int', shape=()).signature(batched=False), + episodes=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + elif function == 'loss': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + elif function == 'update': + return SignatureDict( + singleton=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=0, api_function=True) + def reset(self): + operations = list() + zeros = 
tf_util.zeros(shape=(self.parallel_interactions,), dtype='int') + operations.append(self.buffer_index.assign(value=zeros, read_value=False)) + if self.circular_buffer: + operations.append(self.buffer_start.assign(value=zeros, read_value=False)) + operations.append(self.memory.reset()) + + # TODO: Synchronization optimizer initial sync? + + with tf.control_dependencies(control_inputs=operations): + return super().reset() + + @tf_function(num_args=6, api_function=True) + def experience(self, *, states, internals, auxiliaries, actions, terminal, reward): + true = tf_util.constant(value=True, dtype='bool') + one = tf_util.constant(value=1, dtype='int') + batch_size = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int') + + # Input assertions + assertions = list() + if self.config.create_tf_assertions: + zero = tf_util.constant(value=0, dtype='int') + assertions.extend(self.states_spec.tf_assert( + x=states, batch_size=batch_size, + message='Agent.experience: invalid {issue} for {name} state input.' + )) + assertions.extend(self.internals_spec.tf_assert( + x=internals, batch_size=batch_size, + message='Agent.experience: invalid {issue} for {name} internal input.' + )) + assertions.extend(self.auxiliaries_spec.tf_assert( + x=auxiliaries, batch_size=batch_size, + message='Agent.experience: invalid {issue} for {name} input.' + )) + assertions.extend(self.actions_spec.tf_assert( + x=actions, batch_size=batch_size, + message='Agent.experience: invalid {issue} for {name} action input.' + )) + assertions.extend(self.terminal_spec.tf_assert( + x=terminal, batch_size=batch_size, + message='Agent.experience: invalid {issue} for terminal input.' + )) + assertions.extend(self.reward_spec.tf_assert( + x=reward, batch_size=batch_size, + message='Agent.experience: invalid {issue} for reward input.' + )) + # Mask assertions + if self.config.enable_int_action_masking: + for name, spec in self.actions_spec.items(): + if spec.type == 'int' and spec.num_values is not None: + is_valid = tf.reduce_all(input_tensor=tf.gather( + params=auxiliaries[name]['mask'], + indices=tf.expand_dims(input=actions[name], axis=(spec.rank + 1)), + batch_dims=(spec.rank + 1) + )) + assertions.append(tf.debugging.assert_equal( + x=is_valid, y=true, message="Agent.experience: invalid action / mask." + )) + # Assertion: buffer indices is zero + assertions.append(tf.debugging.assert_equal( + x=tf.math.reduce_sum(input_tensor=self.buffer_index, axis=0), y=zero, + message="Agent.experience: cannot be called mid-episode." + )) + # Assertion: one terminal + num_terms = tf.math.count_nonzero(input=terminal, dtype=tf_util.get_dtype(type='int')) + assertions.append(tf.debugging.assert_equal( + x=num_terms, y=one, + message="Agent.experience: input contains none or more than one terminal." + )) + # Assertion: terminal is last timestep in batch + assertions.append(tf.debugging.assert_greater_equal( + x=terminal[-1], y=one, + message="Agent.experience: terminal is not the last input timestep." 
+ )) + + with tf.control_dependencies(control_inputs=assertions): + # Preprocessing + for name in states: + if name in self.state_preprocessing: + states[name] = self.state_preprocessing[name].apply( + x=states[name], deterministic=true, independent=False + ) + if self.reward_processing is not None: + reward = self.reward_processing.apply( + x=reward, deterministic=true, independent=False + ) + + # Core experience + experienced = self.core_experience( + states=states, internals=internals, auxiliaries=auxiliaries, actions=actions, + terminal=terminal, reward=reward + ) + + # Increment timestep and episode + with tf.control_dependencies(control_inputs=(experienced,)): + assignments = list() + assignments.append(self.timesteps.assign_add(delta=batch_size, read_value=False)) + assignments.append(self.episodes.assign_add(delta=one, read_value=False)) + + with tf.control_dependencies(control_inputs=assignments): + timestep = tf_util.identity(input=self.timesteps) + episode = tf_util.identity(input=self.episodes) + return timestep, episode + + @tf_function(num_args=0, api_function=True) + def update(self): + # Core update + updated = self.core_update() + + with tf.control_dependencies(control_inputs=(updated,)): + return tf_util.identity(input=self.updates) + + @tf_function(num_args=5) + def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent): + zero_float = tf_util.constant(value=0.0, dtype='float') + + # On-policy policy/baseline horizon (TODO: retrieve from buffer!) + assertions = list() + if self.config.create_tf_assertions: + zero = tf_util.constant(value=0, dtype='int') + past_horizon = tf.math.maximum( + x=self.policy.past_horizon(on_policy=True), + y=self.baseline.past_horizon(on_policy=True) + ) + assertions.append(tf.debugging.assert_equal( + x=past_horizon, y=zero, + message="Policy/baseline on-policy horizon currently not supported." + )) + if not independent: + false = tf_util.constant(value=False, dtype='bool') + assertions.append(tf.debugging.assert_equal( + x=deterministic, y=false, + message="Invalid combination deterministic and not independent." 
+ )) + + # Variable noise + if len(self.policy.trainable_variables) > 0 and ( + (not independent and not self.variable_noise.is_constant(value=0.0)) or + (independent and self.variable_noise.final_value() != 0.0) + ): + if independent: + variable_noise = tf_util.constant( + value=self.variable_noise.final_value(), dtype=self.variable_noise.spec.type + ) + else: + variable_noise = self.variable_noise.value() + + def no_variable_noise(): + return [tf.zeros_like(input=var) for var in self.policy.trainable_variables] + + def apply_variable_noise(): + variable_noise_tensors = list() + for variable in self.policy.trainable_variables: + noise = tf.random.normal( + shape=tf_util.shape(x=variable), mean=0.0, stddev=variable_noise, + dtype=self.variable_noise.spec.tf_type() + ) + if variable.dtype != tf_util.get_dtype(type='float'): + noise = tf.cast(x=noise, dtype=variable.dtype) + assignment = variable.assign_add(delta=noise, read_value=False) + with tf.control_dependencies(control_inputs=(assignment,)): + variable_noise_tensors.append(tf_util.identity(input=noise)) + return variable_noise_tensors + + variable_noise_tensors = tf.cond( + pred=tf.math.logical_or( + x=deterministic, y=tf.math.equal(x=variable_noise, y=zero_float) + ), true_fn=no_variable_noise, false_fn=apply_variable_noise + ) + + else: + variable_noise_tensors = list() + + with tf.control_dependencies(control_inputs=(variable_noise_tensors + assertions)): + dependencies = list() + + # State preprocessing (after variable noise) + for name in self.states_spec: + if name in self.state_preprocessing: + states[name] = self.state_preprocessing[name].apply( + x=states[name], deterministic=deterministic, independent=independent + ) + + # Policy act (after variable noise) + batch_size = tf_util.cast(x=tf.shape(input=states.value())[0], dtype='int') + starts = tf.range(batch_size, dtype=tf_util.get_dtype(type='int')) + lengths = tf_util.ones(shape=tf.expand_dims(input=batch_size, axis=0), dtype='int') + horizons = tf.stack(values=(starts, lengths), axis=1) + next_internals = TensorDict() + actions, next_internals['policy'] = self.policy.act( + states=states, horizons=horizons, internals=internals['policy'], + auxiliaries=auxiliaries, deterministic=deterministic, independent=independent + ) + if isinstance(actions, tf.Tensor): + dependencies.append(actions) + else: + dependencies.extend(actions.flatten()) + + # Baseline internals (after variable noise) + # TODO: shouldn't be required for independent-act + if self.separate_baseline and len(self.internals_spec['baseline']) > 0: + next_internals['baseline'] = self.baseline.next_internals( + states=states, horizons=horizons, internals=internals['baseline'], + actions=actions, deterministic=deterministic, independent=independent + ) + else: + next_internals['baseline'] = TensorDict() + dependencies.extend(next_internals.flatten()) + + # Reverse variable noise (after policy act) + if len(variable_noise_tensors) > 0: + with tf.control_dependencies(control_inputs=dependencies): + dependencies = list() + + def apply_variable_noise(): + assignments = list() + for var, noise in zip(self.policy.trainable_variables, variable_noise_tensors): + assignments.append(var.assign_sub(delta=noise, read_value=False)) + return tf.group(*assignments) + + dependencies.append(tf.cond( + pred=tf.math.equal(x=variable_noise, y=zero_float), + true_fn=tf.no_op, false_fn=apply_variable_noise + )) + + # Exploration + if (not independent and ( + isinstance(self.exploration, dict) or not self.exploration.is_constant(value=0.0) 
+ )) or (independent and ( + isinstance(self.exploration, dict) or self.exploration.final_value() != 0.0 + )): + + # Global exploration + if not isinstance(self.exploration, dict): + # exploration_fns = dict() + if not independent and not self.exploration.is_constant(value=0.0): + exploration = self.exploration.value() + elif independent and self.exploration.final_value() != 0.0: + exploration = tf_util.constant( + value=self.exploration.final_value(), dtype=self.exploration.spec.type + ) + else: + assert False + + float_dtype = tf_util.get_dtype(type='float') + for name, spec, action in self.actions_spec.zip_items(actions): + + # Per-action exploration + if isinstance(self.exploration, dict): + if name not in self.exploration: + continue + elif not independent and not self.exploration[name].is_constant(value=0.0): + exploration = self.exploration[name].value() + elif independent and self.exploration[name].final_value() != 0.0: + exploration = tf_util.constant( + value=self.exploration[name].final_value(), + dtype=self.exploration[name].spec.type + ) + else: + continue + + # Apply exploration + if spec.type == 'bool': + # Bool action: if uniform[0, 1] < exploration, then uniform[True, False] + + def apply_exploration(): + shape = tf_util.cast(x=tf.shape(input=action), dtype='int') + half = tf_util.constant(value=0.5, dtype='float') + random_action = tf.random.uniform(shape=shape, dtype=float_dtype) < half + is_random = tf.random.uniform(shape=shape, dtype=float_dtype) < exploration + return tf.where(condition=is_random, x=random_action, y=action) + + elif spec.type == 'int' and spec.num_values is not None: + if self.config.enable_int_action_masking: + # Masked action: if uniform[0, 1] < exploration, then uniform[unmasked] + # (Similar code as for RandomModel.core_act) + + def apply_exploration(): + shape = tf_util.cast(x=tf.shape(input=action), dtype='int') + mask = auxiliaries[name]['mask'] + choices = tf_util.constant( + value=list(range(spec.num_values)), dtype=spec.type, + shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values)) + ) + one = tf_util.constant(value=1, dtype='int', shape=(1,)) + multiples = tf.concat(values=(shape, one), axis=0) + choices = tf.tile(input=choices, multiples=multiples) + choices = tf.boolean_mask(tensor=choices, mask=mask) + num_valid = tf.math.count_nonzero(input=mask, axis=(spec.rank + 1)) + num_valid = tf.reshape(tensor=num_valid, shape=(-1,)) + masked_offset = tf.math.cumsum(x=num_valid, axis=0, exclusive=True) + uniform = tf.random.uniform(shape=shape, dtype=float_dtype) + uniform = tf.reshape(tensor=uniform, shape=(-1,)) + num_valid = tf_util.cast(x=num_valid, dtype='float') + random_offset = tf.dtypes.cast( + x=(uniform * num_valid), dtype=tf.dtypes.int64 + ) + random_action = tf.gather( + params=choices, indices=(masked_offset + random_offset) + ) + random_action = tf.reshape(tensor=random_action, shape=shape) + is_random = tf.random.uniform(shape=shape, dtype=float_dtype) + is_random = is_random < exploration + return tf.where(condition=is_random, x=random_action, y=action) + + else: + # Int action: if uniform[0, 1] < exploration, then uniform[num_values] + + def apply_exploration(): + shape = tf_util.cast(x=tf.shape(input=action), dtype='int') + random_action = tf.random.uniform( + shape=shape, maxval=spec.num_values, dtype=spec.tf_type() + ) + is_random = tf.random.uniform(shape=shape, dtype=float_dtype) + is_random = is_random < exploration + return tf.where(condition=is_random, x=random_action, y=action) + + else: + # Int/float action: action +
normal[0, exploration] + + def apply_exploration(): + shape = tf_util.cast(x=tf.shape(input=action), dtype='int') + noise = tf.random.normal(shape=shape, dtype=spec.tf_type()) + x = action + noise * exploration + + # Clip action if left-/right-bounded + if spec.min_value is not None: + x = tf.math.maximum(x=x, y=spec.min_value) + if spec.max_value is not None: + x = tf.math.minimum(x=x, y=spec.max_value) + return x + + # if isinstance(self.exploration, dict): + # Per-action exploration + actions[name] = tf.cond( + pred=tf.math.logical_or( + x=deterministic, y=tf.math.equal(x=exploration, y=zero_float) + ), true_fn=(lambda: action), false_fn=apply_exploration + ) + + # else: + # exploration_fns[name] = apply_exploration + + # if not isinstance(self.exploration, dict): + # # Global exploration + + # def apply_exploration(): + # for name in self.actions_spec: + # actions[name] = exploration_fns[name]() + # return actions + + # actions = tf.cond( + # pred=tf.math.equal(x=exploration, y=zero_float), + # true_fn=(lambda: actions), false_fn=apply_exploration + # ) + + # Update states/internals/auxiliaries/actions buffers + if not independent: + assignments = list() + buffer_index = tf.gather(params=self.buffer_index, indices=parallel) + if self.circular_buffer: + buffer_index = tf.math.mod(x=buffer_index, y=self.buffer_capacity) + indices = tf.stack(values=(parallel, buffer_index), axis=1) + for name, buffer, state in self.states_buffer.zip_items(states): + value = tf.tensor_scatter_nd_update(tensor=buffer, indices=indices, updates=state) + assignments.append(buffer.assign(value=value)) + # assignments.append(buffer.scatter_nd_update(indices=indices, updates=state)) + for name, buffer, internal in self.internals_buffer.zip_items(internals): # not next_* + value = tf.tensor_scatter_nd_update( + tensor=buffer, indices=indices, updates=internal + ) + assignments.append(buffer.assign(value=value)) + # assignments.append(buffer.scatter_nd_update(indices=indices, updates=internal)) + for name, buffer, auxiliary in self.auxiliaries_buffer.zip_items(auxiliaries): + value = tf.tensor_scatter_nd_update( + tensor=buffer, indices=indices, updates=auxiliary + ) + assignments.append(buffer.assign(value=value)) + # assignments.append(buffer.scatter_nd_update(indices=indices, updates=auxiliary)) + for name, buffer, action in self.actions_buffer.zip_items(actions): + value = tf.tensor_scatter_nd_update(tensor=buffer, indices=indices, updates=action) + assignments.append(buffer.assign(value=value)) + # assignments.append(buffer.scatter_nd_update(indices=indices, updates=action)) + + # Increment buffer index (after buffer assignments) + with tf.control_dependencies(control_inputs=assignments): + ones = tf_util.ones(shape=tf.expand_dims(input=batch_size, axis=0), dtype='int') + indices = tf.expand_dims(input=parallel, axis=1) + value = tf.tensor_scatter_nd_add( + tensor=self.buffer_index, indices=indices, updates=ones + ) + dependencies.append(self.buffer_index.assign(value=value)) + # sparse_delta = tf.IndexedSlices(values=ones, indices=parallel) + # dependencies.append(self.buffer_index.scatter_add(sparse_delta=sparse_delta)) + + with tf.control_dependencies(control_inputs=dependencies): + actions = actions.fmap( + function=(lambda name, x: tf_util.identity(input=x, name=name)), with_names=True + ) + next_internals = next_internals.fmap( + function=(lambda name, x: tf_util.identity(input=x, name=name)), with_names=True + ) + return actions, next_internals + + @tf_function(num_args=3) + def core_observe(self, *, 
terminal, reward, parallel): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + buffer_index = tf.gather(params=self.buffer_index, indices=parallel) + batch_size = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int') + expanded_parallel = tf.expand_dims(input=tf.expand_dims(input=parallel, axis=0), axis=1) + if self.circular_buffer: + buffer_start = tf.gather(params=self.buffer_start, indices=parallel) + + # Assertion: size of terminal equals number of buffered timesteps + assertions = list() + # if self.config.create_tf_assertions: + # if self.circular_buffer: + # maybe_one = tf.minimum(x=buffer_index, y=self.reward_horizon.value()) + # assertions.append(tf.debugging.assert_equal( + # x=batch_size, y=(buffer_index - buffer_start - maybe_one), + # message="Agent.observe: number of observe-timesteps has to be equal to number " + # "of buffered act-timesteps." + # )) + # else: + # assertions.append(tf.debugging.assert_equal( + # x=batch_size, y=buffer_index, + # message="Agent.observe: number of observe-timesteps has to be equal to number " + # "of buffered act-timesteps." + # )) + + if self.config.buffer_observe == 'episode': + # Observe inputs are always buffered in agent until episode is terminated + # --> Call core_experience directly, no need for terminal/reward buffers + + def fn_nonterminal(): + # Should not be called + return tf.debugging.assert_equal(x=batch_size, y=zero) + + def fn_terminal(): + # Gather values from buffers, and episode experience + function = (lambda x: x[parallel, :buffer_index]) + states = self.states_buffer.fmap(function=function, cls=TensorDict) + internals = self.internals_buffer.fmap(function=function, cls=TensorDict) + auxiliaries = self.auxiliaries_buffer.fmap(function=function, cls=TensorDict) + actions = self.actions_buffer.fmap(function=function, cls=TensorDict) + return self.core_experience( + states=states, internals=internals, auxiliaries=auxiliaries, actions=actions, + terminal=terminal, reward=reward + ) + + elif self.reward_horizon == 'episode' or self.parallel_interactions > 1: + # Observe inputs need to be buffered until episode is terminated + # --> Call core_experience if terminal, otherwise buffer terminal/reward + batch_parallel = tf.fill(dims=(batch_size,), value=parallel) + + def fn_nonterminal(): + # Update terminal/reward buffers + assignments = list() + indices = tf.range(start=(buffer_index - batch_size), limit=buffer_index) + indices = tf.stack(values=(batch_parallel, indices), axis=1) + value = tf.tensor_scatter_nd_update( + tensor=self.terminal_buffer, indices=indices, updates=terminal + ) + assignments.append(self.terminal_buffer.assign(value=value)) + value = tf.tensor_scatter_nd_update( + tensor=self.reward_buffer, indices=indices, updates=reward + ) + assignments.append(self.reward_buffer.assign(value=value)) + return tf.group(assignments) + + def fn_terminal(): + # Gather values from buffers, and episode experience + function = (lambda x: x[parallel, :buffer_index]) + states = self.states_buffer.fmap(function=function, cls=TensorDict) + internals = self.internals_buffer.fmap(function=function, cls=TensorDict) + auxiliaries = self.auxiliaries_buffer.fmap(function=function, cls=TensorDict) + actions = self.actions_buffer.fmap(function=function, cls=TensorDict) + episode_terminal = self.terminal_buffer[parallel, :buffer_index - batch_size] + episode_reward = self.reward_buffer[parallel, :buffer_index - batch_size] + episode_terminal = tf.concat(values=(episode_terminal, terminal), 
axis=0) + episode_reward = tf.concat(values=(episode_reward, reward), axis=0) + return self.core_experience( + states=states, internals=internals, auxiliaries=auxiliaries, actions=actions, + terminal=episode_terminal, reward=episode_reward + ) + + else: + # Observe inputs are buffered temporarily and return is computed as soon as possible + # --> Call core_experience if terminal, otherwise ??? + capacity = tf_util.constant(value=self.buffer_capacity, dtype='int') + reward_horizon = self.reward_horizon.value() + reward_discount = self.reward_discount.value() + batch_parallel = tf.fill(dims=(batch_size,), value=parallel) + + def fn_nonterminal(): + # Update terminal/reward buffers + assignments = list() + indices = tf.range(start=(buffer_index - batch_size), limit=buffer_index) + indices = tf.math.mod(x=indices, y=capacity) + indices = tf.stack(values=(batch_parallel, indices), axis=1) + value = tf.tensor_scatter_nd_update( + tensor=self.terminal_buffer, indices=indices, updates=terminal + ) + assignments.append(self.terminal_buffer.assign(value=value)) + value = tf.tensor_scatter_nd_update( + tensor=self.reward_buffer, indices=indices, updates=reward + ) + assignments.append(self.reward_buffer.assign(value=value)) + with tf.control_dependencies(control_inputs=assignments): + # Number of completed timesteps to process + num_complete = buffer_index - buffer_start - reward_horizon + + def true_fn(): + return self._nonterminal_experience( + parallel=parallel, buffer_start=buffer_start, buffer_index=buffer_index, + reward_horizon=reward_horizon, num_complete=num_complete, + reward_discount=reward_discount + ) + + return tf.cond(pred=(num_complete > zero), true_fn=true_fn, false_fn=tf.no_op) + + def fn_terminal(): + # Gather values from buffers + indices = tf.range(start=buffer_start, limit=buffer_index) + indices = tf.math.mod(x=indices, y=capacity) + function = (lambda x: tf.gather(params=x[parallel], indices=indices)) + states = self.states_buffer.fmap(function=function, cls=TensorDict) + internals = self.internals_buffer.fmap(function=function, cls=TensorDict) + auxiliaries = self.auxiliaries_buffer.fmap(function=function, cls=TensorDict) + actions = self.actions_buffer.fmap(function=function, cls=TensorDict) + indices = tf.range(buffer_start, buffer_index - batch_size) + indices = tf.math.mod(x=indices, y=capacity) + episode_terminal = tf.gather(params=self.terminal_buffer[parallel], indices=indices) + episode_reward = tf.gather(params=self.reward_buffer[parallel], indices=indices) + episode_terminal = tf.concat(values=(episode_terminal, terminal), axis=0) + episode_reward = tf.concat(values=(episode_reward, reward), axis=0) + + # Episode experience + experienced = self.core_experience( + states=states, internals=internals, auxiliaries=auxiliaries, actions=actions, + terminal=episode_terminal, reward=episode_reward + ) + + # Increment buffer start index + with tf.control_dependencies(control_inputs=(indices,)): + zeros = tf_util.zeros(shape=(1,), dtype='int') + value = tf.tensor_scatter_nd_update( + tensor=self.buffer_start, indices=expanded_parallel, updates=zeros + ) + assignment = self.buffer_start.assign(value=value) + # sparse_delta = tf.IndexedSlices(values=zero, indices=parallel) + # assignment = self.buffer_start.scatter_update(sparse_delta=sparse_delta) + + return tf.group((experienced, assignment)) + + def fn_terminal_continuation(): + # Appropriate terminal function above + operations = [fn_terminal()] + + # Reset buffer index + with 
tf.control_dependencies(control_inputs=operations): + updates = tf_util.zeros(shape=(1,), dtype='int') + indices = tf.expand_dims(input=tf.expand_dims(input=parallel, axis=0), axis=1) + value = tf.tensor_scatter_nd_update( + tensor=self.buffer_index, indices=indices, updates=updates + ) + operations.append(self.buffer_index.assign(value=value)) + # sparse_delta = tf.IndexedSlices(values=zero, indices=parallel) + # operations.append(self.buffer_index.scatter_update(sparse_delta=sparse_delta)) + + # Preprocessed episode reward summaries (before preprocessed episode reward reset) + if self.reward_processing is not None: + dependencies = list() + if self.summaries == 'all' or 'reward' in self.summaries or \ + self.tracking == 'all' or 'reward' in self.tracking: + if self.summaries == 'all' or 'reward' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + x = tf.gather(params=self.preprocessed_episode_return, indices=parallel) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name='preprocessed-episode-return', data=x, step=self.episodes + )) + dependencies.extend(self.track( + label='reward', name='preprocessed-episode-return', data=x + )) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + # Reset preprocessed episode reward + with tf.control_dependencies(control_inputs=dependencies): + zeros = tf_util.zeros(shape=(1,), dtype='float') + value = tf.tensor_scatter_nd_update( + tensor=self.preprocessed_episode_return, indices=expanded_parallel, + updates=zeros + ) + operations.append(self.preprocessed_episode_return.assign(value=value)) + # zero_float = tf_util.constant(value=0.0, dtype='float') + # sparse_delta = tf.IndexedSlices(values=zero_float, indices=parallel) + # operations.append( + # self.preprocessed_episode_return.scatter_update(sparse_delta=sparse_delta) + # ) + + # Reset preprocessors + for preprocessor in self.state_preprocessing.values(): + operations.append(preprocessor.reset()) + if self.reward_processing is not None: + operations.append(self.reward_processing.reset()) + + return tf.group(*operations) + + # Reward preprocessing + dependencies = assertions + if self.reward_processing is not None: + with tf.control_dependencies(control_inputs=dependencies): + dependencies = list() + true = tf_util.constant(value=True, dtype='bool') + reward = self.reward_processing.apply( + x=reward, deterministic=true, independent=False + ) + + # Preprocessed reward summary + if self.summaries == 'all' or 'reward' in self.summaries or \ + self.tracking == 'all' or 'reward' in self.tracking: + if self.summaries == 'all' or 'reward' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + x = tf.math.reduce_mean(input_tensor=reward, axis=0) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name='preprocessed-reward', data=x, step=self.timesteps + )) + dependencies.extend(self.track( + label='reward', name='preprocessed-reward', data=x + )) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + # Update preprocessed episode reward + sum_reward = tf.math.reduce_sum(input_tensor=reward, keepdims=True) + value = tf.tensor_scatter_nd_add( + tensor=self.preprocessed_episode_return, indices=expanded_parallel, + updates=sum_reward + ) + dependencies.append(self.preprocessed_episode_return.assign(value=value)) + # sum_reward = tf.math.reduce_sum(input_tensor=reward) + # sparse_delta = 
tf.IndexedSlices(values=sum_reward, indices=parallel) + # dependencies.append( + # self.preprocessed_episode_return.scatter_add(sparse_delta=sparse_delta) + # ) + + # Handle terminal vs non-terminal (after preprocessed episode reward) + with tf.control_dependencies(control_inputs=dependencies): + is_terminal = tf.concat(values=([zero], terminal), axis=0)[-1] > zero + experienced = tf.cond( + pred=is_terminal, true_fn=fn_terminal_continuation, false_fn=fn_nonterminal + ) + + # Handle periodic update + with tf.control_dependencies(control_inputs=(experienced,)): + if self.update_frequency is None: + updated = tf_util.constant(value=False, dtype='bool') + + else: + frequency = self.update_frequency.value() + start = self.update_start.value() + + if self.update_unit == 'timesteps': + # Timestep-based batch + past_horizon = tf.math.maximum( + x=self.policy.past_horizon(on_policy=False), + y=self.baseline.past_horizon(on_policy=False) + ) + unit = self.timesteps + start = tf.math.maximum(x=start, y=(frequency + past_horizon)) + if self.reward_horizon == 'episode': + min_start = tf.where( + condition=(self.episodes > zero), x=start, y=(unit + one) + ) + start = tf.math.maximum(x=start, y=min_start) + else: + two = tf_util.constant(value=2, dtype='int') + start = tf.where( + condition=(self.episodes > zero), x=zero, + y=(start + two * self.reward_horizon.value()) + ) + if self.config.buffer_observe == 'episode': + min_start = tf.where( + condition=(self.episodes > zero), x=start, y=(unit + one) + ) + start = tf.math.maximum(x=start, y=min_start) + else: + buffer_observe = tf_util.constant( + value=self.config.buffer_observe, dtype='int' + ) + start = tf.math.maximum(x=start, y=buffer_observe) + + elif self.update_unit == 'episodes': + # Episode-based batch + start = tf.math.maximum(x=start, y=frequency) + # (Episode counter is only incremented at the end of observe) + unit = self.episodes + tf.where(condition=is_terminal, x=one, y=zero) + + unit = unit - start + is_frequency = tf.math.greater_equal(x=unit, y=(self.last_update + frequency)) + + def perform_update(): + assignment = self.last_update.assign(value=unit, read_value=False) + with tf.control_dependencies(control_inputs=(assignment,)): + return self.core_update() + + def no_update(): + return tf_util.constant(value=False, dtype='bool') + + updated = tf.cond(pred=is_frequency, true_fn=perform_update, false_fn=no_update) + + with tf.control_dependencies(control_inputs=(updated,)): + return tf_util.identity(input=updated) + + def _nonterminal_experience( + self, *, parallel, buffer_start, buffer_index, reward_horizon, num_complete, reward_discount + ): + # (similar to _terminal_experience_parallel) + one = tf_util.constant(value=1, dtype='int') + capacity = tf_util.constant(value=self.buffer_capacity, dtype='int') + + # Whether to predict horizon values now + if self.predict_horizon_values != 'early': + assert self.trace_decay.is_constant(value=1.0) + horizon_values = tf_util.zeros( + shape=tf.expand_dims(input=num_complete, axis=0), dtype='float' + ) + + else: + # Baseline horizon + baseline_horizon = self.baseline.past_horizon(on_policy=True) + if self.trace_decay.is_constant(value=1.0): + assertion = tf.debugging.assert_less_equal( + x=baseline_horizon, y=reward_horizon, + message="Baseline on-policy horizon greater than reward estimation horizon " + "currently not supported if prediction_horizon_values = \"early\"." 
+ ) + else: + zero = tf_util.constant(value=0, dtype='int') + assertion = tf.debugging.assert_less_equal( + x=baseline_horizon, y=zero, + message="Baseline on-policy horizon currently not supported if " + "trace_decay != 1.0." + ) + + with tf.control_dependencies(control_inputs=(assertion,)): + + # Index range to gather from buffers + if self.trace_decay.is_constant(value=1.0): + # Only indices relevant for horizon values + indices = tf.range( + start=(buffer_start + reward_horizon - baseline_horizon), limit=buffer_index + ) + ints_end = num_complete + auxs_start = baseline_horizon + horizons_start = tf.range(num_complete) + horizons_length = tf.fill(dims=(num_complete,), value=(baseline_horizon + one)) + else: + # All indices + indices = tf.range(start=(buffer_start + one), limit=buffer_index) + ints_end = None + auxs_start = None + horizons_start = tf.range(buffer_index - buffer_start - one) + horizons_length = tf.ones_like(input=horizons_start) + indices = tf.math.mod(x=indices, y=capacity) + + # Return-sequence per timestep, as horizons indexing tensor + horizons = tf.stack(values=(horizons_start, horizons_length), axis=1) + + # Gather states + function = (lambda x: tf.gather(params=x[parallel], indices=indices)) + states = self.states_buffer.fmap(function=function, cls=TensorDict) + + # Gather internals, only for return-sequence start + function = (lambda x: tf.gather(params=x[parallel], indices=indices[:ints_end])) + key = ('baseline' if self.separate_baseline else 'policy') + if len(self.internals_spec[key]) > 0: + internals = self.internals_buffer[key].fmap(function=function, cls=TensorDict) + else: + internals = TensorDict() + + # Gather auxiliaries (and actions), only for return-sequence end + function = (lambda x: tf.gather(params=x[parallel], indices=indices[auxs_start:])) + auxiliaries = self.auxiliaries_buffer.fmap(function=function, cls=TensorDict) + + # Predict values + if self.predict_action_values: + # TODO: option to re-sample action deterministically? 
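+ # Early estimate: Q-values are computed for the actions already stored in the buffer rather than re-sampled from the current policy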
+ actions = self.actions_buffer.fmap(function=function, cls=TensorDict) + values = self.baseline.action_value( + states=states, horizons=horizons, internals=internals, + auxiliaries=auxiliaries, actions=actions + ) + else: + values = self.baseline.state_value( + states=states, horizons=horizons, internals=internals, + auxiliaries=auxiliaries + ) + + # Horizon values + if self.trace_decay.is_constant(value=1.0): + horizon_values = values + else: + horizon_values = values[reward_horizon - one:] + + # Gather all rewards (incl return-horizon) from buffer + indices = tf.range(start=buffer_start, limit=(buffer_index - one)) + indices = tf.math.mod(x=indices, y=capacity) + reward = tf.gather(params=self.reward_buffer[parallel], indices=indices) + + # Recursive return + if self.trace_decay.is_constant(value=1.0): + # Discounted cumulative sum + def recursive_return(next_return, index): + return reward[index: index + num_complete] + reward_discount * next_return + + else: + # TD-lambda + one_float = tf_util.constant(value=1.0, dtype='float') + trace_decay = self.trace_decay.value() + + def recursive_return(next_return, index): + next_value = values[index: index + num_complete] + next_return = (one_float - trace_decay) * next_value + trace_decay * next_return + return reward[index: index + num_complete] + reward_discount * next_return + + reward = tf.foldr( + fn=recursive_return, elems=tf.range(reward_horizon), initializer=horizon_values + ) + + # Gather other values of completed timesteps from buffers + indices = tf.range(start=buffer_start, limit=(buffer_start + num_complete)) + indices = tf.math.mod(x=indices, y=capacity) + function = (lambda x: tf.gather(params=x[parallel], indices=indices)) + states = self.states_buffer.fmap(function=function, cls=TensorDict) + internals = self.internals_buffer.fmap(function=function, cls=TensorDict) + auxiliaries = self.auxiliaries_buffer.fmap(function=function, cls=TensorDict) + actions = self.actions_buffer.fmap(function=function, cls=TensorDict) + terminal = function(self.terminal_buffer) + + # Store completed timesteps + experienced = self.memory.enqueue( + states=states, internals=internals, auxiliaries=auxiliaries, actions=actions, + terminal=terminal, reward=reward + ) + + # Increment buffer start index + with tf.control_dependencies(control_inputs=(indices,)): + updates = tf.expand_dims(input=num_complete, axis=0) + indices = tf.expand_dims(input=tf.expand_dims(input=parallel, axis=0), axis=1) + value = tf.tensor_scatter_nd_add( + tensor=self.buffer_start, indices=indices, updates=updates + ) + assignment = self.buffer_start.assign(value=value) + # sparse_delta = tf.IndexedSlices(values=num_complete, indices=parallel) + # assignment = self.buffer_start.scatter_add(sparse_delta=sparse_delta) + + return tf.group((experienced, assignment)) + + @tf_function(num_args=6) + def core_experience(self, *, states, internals, auxiliaries, actions, terminal, reward): + episode_length = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int') + reward_discount = self.reward_discount.value() + + if self.reward_horizon == 'episode': + # Reward horizon is entire episode + reward = self._terminal_experience_iterative( + episode_length=episode_length, reward_discount=reward_discount, states=states, + internals=internals, auxiliaries=auxiliaries, actions=actions, reward=reward, + terminal=terminal + ) + + else: + # Optimize required loop iterations, so whether to process remaining timesteps + # - iteratively, if remaining episode length is at most reward horizon + # - in 
parallel, if reward horizon is less than remaining episode length + reward_horizon = self.reward_horizon.value() + + def true_fn(): + return self._terminal_experience_iterative( + episode_length=episode_length, reward_discount=reward_discount, states=states, + internals=internals, auxiliaries=auxiliaries, actions=actions, reward=reward, + terminal=terminal + ) + + def false_fn(): + return self._terminal_experience_parallel( + episode_length=episode_length, reward_horizon=reward_horizon, + reward_discount=reward_discount, states=states, internals=internals, + auxiliaries=auxiliaries, actions=actions, reward=reward, terminal=terminal + ) + + reward = tf.cond( + pred=(episode_length <= reward_horizon), true_fn=true_fn, false_fn=false_fn + ) + + # Store episode + return self.memory.enqueue( + states=states, internals=internals, auxiliaries=auxiliaries, actions=actions, + terminal=terminal, reward=reward + ) + + def _terminal_experience_iterative( + self, *, episode_length, reward_discount, + states, internals, auxiliaries, actions, reward, terminal + ): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + zero_float = tf_util.constant(value=0.0, dtype='float') + internals = (internals['baseline'] if self.separate_baseline else internals['policy']) + + if self.trace_decay.is_constant(value=1.0): + # Whether to predict horizon/terminal values now + if self.predict_horizon_values != 'early': + + # Whether to predict all or only abort-terminals + # (-reward[-1] since terminal state value will be predicted) + terminal_value = -reward[-1] / reward_discount + if not self.predict_terminal_values: + is_terminal = tf.math.equal(x=terminal[-1], y=one) + terminal_value = tf.where(condition=is_terminal, x=zero_float, y=terminal_value) + + else: + + def predict_terminal_value(): + # Baseline horizon + baseline_horizon = self.baseline.past_horizon(on_policy=True) + baseline_horizon = tf.math.minimum(x=baseline_horizon, y=episode_length) + + # Single-step horizon + horizon_start = episode_length - one - baseline_horizon + horizons = tf.expand_dims( + input=tf.stack(values=(zero, baseline_horizon + one)), axis=0 + ) + + # Predict values + if self.predict_action_values: + # TODO: option to re-sample action deterministically? 
+ # Use given actions since early estimate + # if self.separate_baseline: + # policy_horizon = self.policy.past_horizon(on_policy=True) + # policy_horizon = tf.math.minimum(x=policy_horizon, y=episode_length) + # policy_horizon_start = terminal_index - policy_horizon + # else: + # policy_horizon_start = past_horizon_start + # deterministic = tf_util.constant(value=True, dtype='bool') + # _actions, _ = self.policy.act( + # states=states[policy_horizon_start:], horizons=horizons[:maybe_one], + # internals=internals['policy'][policy_horizon_start: policy_horizon_start + maybe_one], + # auxiliaries=auxiliaries[terminal_index:], deterministic=deterministic, + # independent=True + # ) + terminal_value = self.baseline.action_value( + states=states[horizon_start:], horizons=horizons, + internals=internals[horizon_start: horizon_start + one], + auxiliaries=auxiliaries[-1:], + actions=actions[-1:] + ) + else: + terminal_value = self.baseline.state_value( + states=states[horizon_start:], horizons=horizons, + internals=internals[horizon_start: horizon_start + one], + auxiliaries=auxiliaries[-1:] + ) + + # Modification to correct for use as initializer in tf.scan + # (-reward[-1] since terminal state value will be predicted) + return (terminal_value[0] - reward[-1]) / reward_discount + + # Whether to predict all or only abort-terminals + if self.predict_terminal_values: + terminal_value = predict_terminal_value() + else: + is_terminal = tf.math.equal(x=terminal[-1], y=one) + terminal_value = tf.cond( + pred=is_terminal, true_fn=(lambda: zero_float), + false_fn=predict_terminal_value + ) + + # Discounted cumulative sum return + def recursive_return(next_return, current_reward): + return current_reward + reward_discount * next_return + + return tf.scan( + fn=recursive_return, elems=reward, initializer=terminal_value, reverse=True + ) + + else: + # Baseline horizon + baseline_horizon = self.baseline.past_horizon(on_policy=True) + assertion = tf.debugging.assert_equal( + x=baseline_horizon, y=zero, + message="Baseline cannot have on-policy horizon if trace_decay != 1.0." + ) + + with tf.control_dependencies(control_inputs=(assertion,)): + # Baseline-horizon-sequence per timestep, as horizons indexing tensor + horizons_start = tf.range(episode_length - one) + horizons_length = tf.fill(dims=(episode_length - one,), value=one) + horizons = tf.stack(values=(horizons_start, horizons_length), axis=1) + + if self.predict_action_values: + # TODO: option to re-sample action deterministically? 
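+ # Bootstrap values for every timestep after the first; these feed into the TD-lambda recursion below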
+ values = self.baseline.action_value( + states=states[1:], horizons=horizons, internals=internals[1:], + auxiliaries=auxiliaries[1:], actions=actions[1:] + ) + else: + values = self.baseline.state_value( + states=states[1:], horizons=horizons, internals=internals[1:], + auxiliaries=auxiliaries[1:] + ) + + # Modification to correct for use as initializer in tf.scan + # (-reward[-1] since terminal state value will be predicted) + terminal_value = (values[-1] - reward[-1]) / reward_discount + + # Whether to predict all or only abort-terminals + if not self.predict_terminal_values: + is_terminal = tf.math.equal(x=terminal[-1], y=one) + terminal_value = tf.where(condition=is_terminal, x=zero_float, y=terminal_value) + + values = tf.concat(values=(values, [terminal_value]), axis=0) + + # TD-lambda return + one_float = tf_util.constant(value=1.0, dtype='float') + trace_decay = self.trace_decay.value() + + def recursive_return(next_return, reward_value): + current_reward, next_value = reward_value + next_return = (one_float - trace_decay) * next_value + trace_decay * next_return + return current_reward + reward_discount * next_return + + return tf.scan( + fn=recursive_return, elems=(reward, values), initializer=terminal_value, + reverse=True + ) + + def _terminal_experience_parallel( + self, *, episode_length, reward_horizon, reward_discount, + states, internals, auxiliaries, actions, reward, terminal + ): + # (similar to _nonterminal_experience) + one = tf_util.constant(value=1, dtype='int') + internals = (internals['baseline'] if self.separate_baseline else internals['policy']) + + # Whether to predict horizon values now + if self.predict_horizon_values != 'early': + assert self.trace_decay.is_constant(value=1.0) + + # Whether to predict all or only abort-terminals + terminal_value = tf_util.constant(value=0.0, dtype='float') + if not self.predict_terminal_values: + is_terminal = tf.math.equal(x=terminal[-1], y=one) + terminal_value = tf.where(condition=is_terminal, x=reward[-1], y=terminal_value) + + # Horizon-expanded rewards and values + horizon_values = tf_util.zeros( + shape=tf.expand_dims(input=episode_length, axis=0), dtype='float' + ) + reward = tf.concat( + values=(reward[:-1], [terminal_value], horizon_values[:reward_horizon]), axis=0 + ) + + else: + # Baseline horizon + baseline_horizon = self.baseline.past_horizon(on_policy=True) + assertions = list() # (control dependency below, before baseline call) + if not self.trace_decay.is_constant(value=1.0): + zero = tf_util.constant(value=0, dtype='int') + assertions.append(tf.debugging.assert_equal( + x=baseline_horizon, y=zero, + message="Baseline cannot have on-policy horizon if trace_decay != 1.0." 
+ )) + + # Index starts/ends + if self.trace_decay.is_constant(value=1.0): + # Only indices relevant for horizon values + reward_horizon_start = reward_horizon + zero = tf_util.constant(value=0, dtype='int') + baseline_horizon_start = tf.maximum( + x=(reward_horizon_start - baseline_horizon), y=zero + ) + baseline_horizon_end = episode_length - baseline_horizon + baseline_horizon_end = tf.maximum(x=baseline_horizon_end, y=baseline_horizon_start) + horizons_start = tf.range(baseline_horizon_end - baseline_horizon_start) + horizons_length = reward_horizon_start + horizons_start + horizons_length = tf.math.minimum(x=horizons_length, y=(baseline_horizon + one)) + else: + # All indices + reward_horizon_start = 1 + baseline_horizon_start = 1 + baseline_horizon_end = None + horizons_start = tf.range(episode_length - one) + horizons_length = tf.ones_like(input=horizons_start) + + # Baseline-horizon-sequence per timestep, as horizons indexing tensor + horizons = tf.stack(values=(horizons_start, horizons_length), axis=1) + + # Predict values + with tf.control_dependencies(control_inputs=assertions): + if self.predict_action_values: + # TODO: option to re-sample action deterministically? + values = self.baseline.action_value( + states=states[baseline_horizon_start:], + horizons=horizons, + internals=internals[baseline_horizon_start: baseline_horizon_end], + auxiliaries=auxiliaries[reward_horizon_start:], + actions=actions[reward_horizon_start:] + ) + else: + values = self.baseline.state_value( + states=states[baseline_horizon_start:], + horizons=horizons, + internals=internals[baseline_horizon_start: baseline_horizon_end], + auxiliaries=auxiliaries[reward_horizon_start:] + ) + + # Whether to predict all or only abort-terminals + terminal_value = values[-1] + if not self.predict_terminal_values: + is_terminal = tf.math.equal(x=terminal[-1], y=one) + terminal_value = tf.where(condition=is_terminal, x=reward[-1], y=terminal_value) + + # Horizon-expanded rewards and values + zeros_reward_horizon = tf_util.zeros( + shape=tf.expand_dims(input=(reward_horizon - one), axis=0), dtype='float' + ) + reward = tf.concat(values=(reward[:-1], [terminal_value], zeros_reward_horizon), axis=0) + zeros_reward_horizon = tf_util.zeros( + shape=tf.expand_dims(reward_horizon, axis=0), dtype='float' + ) + values = tf.concat(values=(values, zeros_reward_horizon), axis=0) + + # Horizon values + if self.trace_decay.is_constant(value=1.0): + horizon_values = values + else: + horizon_values = values[reward_horizon - one:] + + # Recursive return + if self.trace_decay.is_constant(value=1.0): + # Discounted cumulative sum + def recursive_return(next_return, index): + return reward[index: index + episode_length] + reward_discount * next_return + + else: + # TD-lambda + one_float = tf_util.constant(value=1.0, dtype='float') + trace_decay = self.trace_decay.value() + + def recursive_return(next_return, index): + next_value = values[index: index + episode_length] + next_return = (one_float - trace_decay) * next_value + trace_decay * next_return + return reward[index: index + episode_length] + reward_discount * next_return + + return tf.foldr( + fn=recursive_return, elems=tf.range(reward_horizon), initializer=horizon_values + ) + + @tf_function(num_args=0) + def core_update(self): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + true = tf_util.constant(value=True, dtype='bool') + + # Retrieve batch + batch_size = self.update_batch_size.value() + if self.update_unit == 'timesteps': + # 
Timestep-based batch + # Dependency horizon + past_horizon = tf.math.maximum( + x=self.policy.past_horizon(on_policy=False), + y=self.baseline.past_horizon(on_policy=False) + ) + if self.predict_horizon_values != 'late': + future_horizon = zero + elif self.reward_horizon == 'episode': + future_horizon = tf_util.constant(value=self.max_episode_timesteps, dtype='int') + else: + future_horizon = self.reward_horizon.value() + indices = self.memory.retrieve_timesteps( + n=batch_size, past_horizon=past_horizon, future_horizon=future_horizon + ) + elif self.update_unit == 'episodes': + # Episode-based batch + indices = self.memory.retrieve_episodes(n=batch_size) + + # Retrieve states and internals + policy_horizon = self.policy.past_horizon(on_policy=False) + if self.separate_baseline and self.baseline_optimizer is None: + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_equal( + x=policy_horizon, y=self.baseline.past_horizon(on_policy=False), + message="Policy and baseline cannot depend on a different number of previous " + "states if baseline_optimizer is None." + )) + with tf.control_dependencies(control_inputs=assertions): + policy_horizons, sequence_values, initial_values = self.memory.predecessors( + indices=indices, horizon=policy_horizon, sequence_values=('states',), + initial_values=('internals',) + ) + baseline_horizons = policy_horizons + baseline_states = policy_states = sequence_values['states'] + internals = policy_internals = initial_values['internals'] + if self.separate_baseline: + baseline_internals = policy_internals['baseline'] + else: + baseline_internals = policy_internals + else: + if self.baseline_optimizer is None: + policy_horizons, sequence_values, initial_values = self.memory.predecessors( + indices=indices, horizon=policy_horizon, sequence_values=('states',), + initial_values=('internals',) + ) + policy_states = sequence_values['states'] + internals = policy_internals = initial_values['internals'] + elif len(self.internals_spec['policy']) > 0: + policy_horizons, sequence_values, initial_values = self.memory.predecessors( + indices=indices, horizon=policy_horizon, sequence_values=('states',), + initial_values=('internals/policy',) + ) + policy_states = sequence_values['states'] + internals = initial_values['internals'] + policy_internals = initial_values['internals/policy'] + else: + policy_horizons, sequence_values = self.memory.predecessors( + indices=indices, horizon=policy_horizon, sequence_values=('states',), + initial_values=() + ) + policy_states = sequence_values['states'] + internals = policy_internals = TensorDict() + # Optimize !!!!! 
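+ # Retrieve baseline states and internals (baseline horizon and internals may differ from the policy ones)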
+ baseline_horizon = self.baseline.past_horizon(on_policy=False) + if self.separate_baseline: + if len(self.internals_spec['baseline']) > 0: + baseline_horizons, sequence_values, initial_values = self.memory.predecessors( + indices=indices, horizon=baseline_horizon, sequence_values=('states',), + initial_values=('internals/baseline',) + ) + baseline_states = sequence_values['states'] + internals = initial_values['internals'] + baseline_internals = initial_values['internals/baseline'] + else: + baseline_horizons, sequence_values = self.memory.predecessors( + indices=indices, horizon=baseline_horizon, sequence_values=('states',), + initial_values=() + ) + baseline_states = sequence_values['states'] + internals = baseline_internals = TensorDict() + else: + if len(self.internals_spec['policy']) > 0: + baseline_horizons, sequence_values, initial_values = self.memory.predecessors( + indices=indices, horizon=baseline_horizon, sequence_values=('states',), + initial_values=('internals/policy',) + ) + baseline_states = sequence_values['states'] + internals = initial_values['internals'] + baseline_internals = initial_values['internals/policy'] + else: + baseline_horizons, sequence_values = self.memory.predecessors( + indices=indices, horizon=baseline_horizon, sequence_values=('states',), + initial_values=() + ) + baseline_states = sequence_values['states'] + internals = baseline_internals = TensorDict() + + # Retrieve auxiliaries, actions, reward + if self.gae_decay.is_constant(value=0.0): + values = self.memory.retrieve( + indices=indices, values=('auxiliaries', 'actions', 'reward') + ) + else: + values = self.memory.retrieve( + indices=indices, values=('auxiliaries', 'actions', 'reward', 'terminal') + ) + terminal = values['terminal'] + auxiliaries = values['auxiliaries'] + actions = values['actions'] + reward = values['reward'] + + # Return estimation + if self.predict_horizon_values == 'late': + reward = self._complete_horizon_values( + indices=indices, internals=internals, reward=reward + ) + + dependencies = [reward] + if self.summaries == 'all' or 'reward' in self.summaries or \ + self.tracking == 'all' or 'reward' in self.tracking: + if self.summaries == 'all' or 'reward' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + x = tf.math.reduce_mean(input_tensor=reward, axis=0) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name='update-return', data=x, step=self.updates + )) + dependencies.extend(self.track(label='reward', name='update-return', data=x)) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + if self.return_processing is not None: + with tf.control_dependencies(control_inputs=dependencies): + reward = self.return_processing.apply( + x=reward, deterministic=true, independent=False + ) + + dependencies = [reward] + if self.summaries == 'all' or 'reward' in self.summaries or \ + self.tracking == 'all' or 'reward' in self.tracking: + if self.summaries == 'all' or 'reward' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + x = tf.math.reduce_mean(input_tensor=reward, axis=0) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name='update-processed-return', data=x, step=self.updates + )) + dependencies.extend(self.track( + label='reward', name='update-processed-return', data=x + )) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + baseline_arguments = TensorDict( + 
states=baseline_states, horizons=baseline_horizons, internals=baseline_internals, + auxiliaries=auxiliaries, actions=actions, reward=reward + ) + if self.baseline_objective is not None: + baseline_arguments['reference'] = self.baseline_objective.reference( + states=baseline_states, horizons=baseline_horizons, internals=baseline_internals, + auxiliaries=auxiliaries, actions=actions, policy=self.baseline + ) + + if self.baseline_optimizer is not None and self.estimate_advantage != 'early': + def fn_kl_divergence( + *, states, horizons, internals, auxiliaries, actions, reward, reference + ): + reference = self.baseline.kldiv_reference( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + return self.baseline.kl_divergence( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + reference=reference + ) + + variables = tuple(self.baseline.trainable_variables) + + kwargs = dict() + try: + ordered_names = [variable.name for variable in variables] + kwargs['source_variables'] = tuple(sorted( + self.policy.trainable_variables, + key=(lambda x: ordered_names.index(x.name.replace('/policy/', '/baseline/'))) + )) + except ValueError: + pass + + dependencies.extend(baseline_arguments.flatten()) + + # Optimization + with tf.control_dependencies(control_inputs=dependencies): + optimized = self.baseline_optimizer.update( + arguments=baseline_arguments, variables=variables, fn_loss=self.baseline_loss, + fn_kl_divergence=fn_kl_divergence, **kwargs + ) + dependencies = [optimized] + + with tf.control_dependencies(control_inputs=dependencies): + if self.estimate_advantage is not False and not self.advantage_in_loss: + if self.predict_action_values: + # Use past actions since advantage R(s,a) - Q(s,a) + baseline_prediction = self.baseline.action_value( + states=baseline_states, horizons=baseline_horizons, + internals=baseline_internals, auxiliaries=auxiliaries, actions=actions + ) + else: + baseline_prediction = self.baseline.state_value( + states=baseline_states, horizons=baseline_horizons, + internals=baseline_internals, auxiliaries=auxiliaries + ) + reward = reward - baseline_prediction + + dependencies = [reward] + if self.summaries == 'all' or 'reward' in self.summaries or \ + self.tracking == 'all' or 'reward' in self.tracking: + if self.summaries == 'all' or 'reward' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + x = tf.math.reduce_mean(input_tensor=reward, axis=0) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name='update-advantage', data=x, step=self.updates + )) + dependencies.extend(self.track( + label='reward', name='update-advantage', data=x + )) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + if self.advantage_processing is not None: + with tf.control_dependencies(control_inputs=dependencies): + reward = self.advantage_processing.apply( + x=reward, deterministic=true, independent=False + ) + + dependencies = [reward] + if self.summaries == 'all' or 'reward' in self.summaries or \ + self.tracking == 'all' or 'reward' in self.tracking: + if self.summaries == 'all' or 'reward' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + x = tf.math.reduce_mean(input_tensor=reward, axis=0) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name='update-processed-advantage', data=x, + step=self.updates + )) + dependencies.extend(self.track( + 
label='reward', name='update-processed-advantage', data=x + )) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + if not self.gae_decay.is_constant(value=0.0): + with tf.control_dependencies(control_inputs=dependencies): + # Requires consistent batch!!! + zero_float = tf_util.constant(value=0.0, dtype='float') + reward_discount = self.reward_discount.value() + gae_decay = self.gae_decay.value() + + # Discounted cumulative sum + def recursive_gae(next_gae, advantage_terminal): + current_advantage, current_terminal = advantage_terminal + next_gae = tf.where( + condition=(current_terminal == zero), x=next_gae, y=zero_float + ) + return current_advantage + reward_discount * gae_decay * next_gae + + reward = tf.scan( + fn=recursive_gae, elems=(reward, terminal), initializer=zero_float, + reverse=True + ) + + dependencies = [reward] + if self.summaries == 'all' or 'reward' in self.summaries or \ + self.tracking == 'all' or 'reward' in self.tracking: + if self.summaries == 'all' or 'reward' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + x = tf.math.reduce_mean(input_tensor=reward, axis=0) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name='update-gae', data=x, step=self.updates + )) + dependencies.extend(self.track( + label='reward', name='update-gae', data=x + )) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + if self.baseline_optimizer is None: + policy_only_internals = policy_internals['policy'] + else: + policy_only_internals = policy_internals + reference = self.objective.reference( + states=policy_states, horizons=policy_horizons, internals=policy_only_internals, + auxiliaries=auxiliaries, actions=actions, policy=self.policy + ) + if self.baseline_objective is not None and self.baseline_loss_weight is not None and \ + not self.baseline_loss_weight.is_constant(value=0.0): + reference = TensorDict(policy=reference, baseline=baseline_arguments['reference']) + + policy_arguments = TensorDict( + states=policy_states, horizons=policy_horizons, internals=policy_internals, + auxiliaries=auxiliaries, actions=actions, reward=reward, reference=reference + ) + + if self.estimate_advantage is not False and self.advantage_in_loss: + variables = tuple(self.trainable_variables) + + def fn_loss(*, states, horizons, internals, auxiliaries, actions, reward, reference): + assertions = list() + if self.config.create_tf_assertions: + past_horizon = self.baseline.past_horizon(on_policy=False) + # TODO: remove restriction + assertions.append(tf.debugging.assert_less_equal( + x=(horizons[:, 1] - one), y=past_horizon, + message="Baseline horizon cannot be greater than policy horizon." 
+ )) + with tf.control_dependencies(control_inputs=assertions): + if self.predict_action_values: + # Use past actions since advantage R(s,a) - Q(s,a) + baseline_prediction = self.baseline.action_value( + states=states, horizons=horizons, internals=internals['baseline'], + auxiliaries=auxiliaries, actions=actions + ) + else: + baseline_prediction = self.baseline.state_value( + states=states, horizons=horizons, internals=internals['baseline'], + auxiliaries=auxiliaries + ) + reward = reward - baseline_prediction + + def fn_summary1(): + return tf.math.reduce_mean(input_tensor=reward, axis=0) + + dependencies = self.summary( + label='reward', name='update-advantage', data=fn_summary1, step='updates' + ) + dependencies.extend(self.track( + label='reward', name='update-advantage', data=fn_summary1 + )) + + if self.advantage_processing is not None: + with tf.control_dependencies(control_inputs=dependencies): + reward = self.advantage_processing.apply( + x=reward, deterministic=true, independent=False + ) + + def fn_summary2(): + return tf.math.reduce_mean(input_tensor=reward, axis=0) + + dependencies = self.summary( + label='reward', name='update-processed-advantage', + data=fn_summary2, step='updates' + ) + dependencies.extend(self.track( + label='reward', name='update-processed-advantage', data=fn_summary2 + )) + + with tf.control_dependencies(control_inputs=dependencies): + return self.loss( + states=states, horizons=horizons, internals=internals, + auxiliaries=auxiliaries, actions=actions, reward=reward, reference=reference + ) + + else: + variables = tuple(self.policy.trainable_variables) + fn_loss = self.loss + + def fn_kl_divergence( + *, states, horizons, internals, auxiliaries, actions, reward, reference + ): + if self.baseline_optimizer is None: + internals = internals['policy'] + # TODO: Policy require + reference = self.policy.kldiv_reference( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + return self.policy.kl_divergence( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + reference=reference + ) + + kwargs = dict() + if self.separate_baseline: + try: + ordered_names = [variable.name for variable in variables] + kwargs['source_variables'] = tuple(sorted( + self.baseline.trainable_variables, + key=(lambda x: ordered_names.index(x.name.replace('/baseline/', '/policy/'))) + )) + except ValueError: + pass + # if self.global_model is not None: + # assert 'global_variables' not in kwargs + # kwargs['global_variables'] = tuple(self.global_model.trainable_variables) + + dependencies.extend(policy_arguments.flatten()) + + # Hack: KL divergence summary: reference before update + if isinstance(self.policy, StochasticPolicy) and ( + self.summaries == 'all' or 'kl-divergence' in self.summaries or + self.tracking == 'all' or 'kl-divergence' in self.tracking + ): + kldiv_reference = self.policy.kldiv_reference( + states=policy_states, horizons=policy_horizons, internals=policy_only_internals, + auxiliaries=auxiliaries + ) + dependencies.extend(kldiv_reference.flatten()) + + # Optimization + with tf.control_dependencies(control_inputs=dependencies): + optimized = self.optimizer.update( + arguments=policy_arguments, variables=variables, fn_loss=fn_loss, + fn_kl_divergence=fn_kl_divergence, **kwargs + ) + dependencies = [optimized] + + if self.baseline_optimizer is not None and self.estimate_advantage == 'early': + def fn_kl_divergence( + *, states, horizons, internals, auxiliaries, actions, reward, reference + ): + reference = 
self.baseline.kldiv_reference( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + return self.baseline.kl_divergence( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + reference=reference + ) + + variables = tuple(self.baseline.trainable_variables) + + kwargs = dict() + try: + ordered_names = [variable.name for variable in variables] + kwargs['source_variables'] = tuple(sorted( + self.policy.trainable_variables, + key=(lambda x: ordered_names.index(x.name.replace('/policy/', '/baseline/'))) + )) + except ValueError: + pass + + dependencies.extend(baseline_arguments.flatten()) + + # Optimization + with tf.control_dependencies(control_inputs=dependencies): + optimized = self.baseline_optimizer.update( + arguments=baseline_arguments, variables=variables, fn_loss=self.baseline_loss, + fn_kl_divergence=fn_kl_divergence, **kwargs + ) + dependencies = [optimized] + + # Update summaries + with tf.control_dependencies(control_inputs=dependencies): + dependencies = list() + + # Entropy summaries + if isinstance(self.policy, StochasticPolicy) and ( + self.summaries == 'all' or 'entropy' in self.summaries or + self.tracking == 'all' or 'entropy' in self.tracking + ): + if self.summaries == 'all' or 'entropy' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + if len(self.actions_spec) > 1: + entropies = self.policy.entropies( + states=policy_states, horizons=policy_horizons, + internals=policy_only_internals, auxiliaries=auxiliaries + ) + for name, spec in self.actions_spec.items(): + entropies[name] = tf.reshape(tensor=entropies[name], shape=(-1,)) + entropy = tf.math.reduce_mean(input_tensor=entropies[name], axis=0) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name=('entropies/' + name), data=entropy, step=self.updates + )) + dependencies.extend(self.track( + label='entropy', name=('entropies/' + name), data=entropy + )) + entropy = tf.concat(values=tuple(entropies.values()), axis=0) + else: + entropy = self.policy.entropy( + states=policy_states, horizons=policy_horizons, + internals=policy_only_internals, auxiliaries=auxiliaries + ) + entropy = tf.math.reduce_mean(input_tensor=entropy, axis=0) + if summarizer is not None: + dependencies.append( + tf.summary.scalar(name='entropy', data=entropy, step=self.updates) + ) + dependencies.extend(self.track(label='entropy', name='entropy', data=entropy)) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + # KL divergence summaries + if isinstance(self.policy, StochasticPolicy) and ( + self.summaries == 'all' or 'kl-divergence' in self.summaries or + self.tracking == 'all' or 'kl-divergence' in self.tracking + ): + if self.summaries == 'all' or 'kl-divergence' in self.summaries: + summarizer = self.summarizer.as_default() + summarizer.__enter__() + else: + summarizer = None + if len(self.actions_spec) > 1: + kl_divs = self.policy.kl_divergences( + states=policy_states, horizons=policy_horizons, + internals=policy_only_internals, auxiliaries=auxiliaries, + reference=kldiv_reference + ) + for name, spec in self.actions_spec.items(): + kl_divs[name] = tf.reshape(tensor=kl_divs[name], shape=(-1,)) + kl_div = tf.math.reduce_mean(input_tensor=kl_divs[name], axis=0) + if summarizer is not None: + dependencies.append(tf.summary.scalar( + name=('kl-divergences/' + name), data=kl_div, step=self.updates + )) + dependencies.extend(self.track( + label='kl-divergence', 
name=('kl-divergences/' + name), data=kl_div + )) + kl_div = tf.concat(values=tuple(kl_divs.values()), axis=0) + else: + kl_div = self.policy.kl_divergence( + states=policy_states, horizons=policy_horizons, + internals=policy_only_internals, auxiliaries=auxiliaries, + reference=kldiv_reference + ) + kl_div = tf.math.reduce_mean(input_tensor=kl_div, axis=0) + if summarizer is not None: + dependencies.append( + tf.summary.scalar(name='kl-divergence', data=kl_div, step=self.updates) + ) + dependencies.extend( + self.track(label='kl-divergence', name='kl-divergence', data=kl_div) + ) + if summarizer is not None: + summarizer.__exit__(None, None, None) + + # Increment update + with tf.control_dependencies(control_inputs=dependencies): + assignment = self.updates.assign_add(delta=one, read_value=False) + + with tf.control_dependencies(control_inputs=(assignment,)): + dependencies = list() + + # Variables summaries + if self.summaries == 'all' or 'variables' in self.summaries: + with self.summarizer.as_default(): + for variable in self.trainable_variables: + name = variable.name + assert name[-2] == ':' + if name.startswith(self.name + '/'): + name = 'variables/' + name[len(self.name) + 1: -2] + else: + name = name[:-2] + x = tf.math.reduce_mean(input_tensor=variable) + dependencies.append(tf.summary.scalar(name=name, data=x, step=self.updates)) + + with tf.control_dependencies(control_inputs=dependencies): + return tf_util.identity(input=optimized) + + def _complete_horizon_values(self, indices, internals, reward): + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + true = tf_util.constant(value=True, dtype='bool') + reward_horizon = self.reward_horizon.value() + reward_discount = self.reward_discount.value() + + # TODO: no need for memory if update episode-based (or not random replay?) 
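+ # (Completes the given rewards by adding the appropriately discounted baseline value
+ # of the state up to reward_horizon steps ahead; values at proper terminals are
+ # skipped unless predict_terminal_values is set.)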
+ + # Internal values to retrieve, depending on different internals configurations + baseline_internals_values = 'internals/baseline' + if self.predict_action_values and self.separate_baseline: + internals_values = 'internals' + elif self.separate_baseline: + if len(self.internals_spec['baseline']) > 0: + internals_values = 'internals/baseline' + else: + internals_values = None + else: + if len(self.internals_spec['policy']) > 0: + internals_values = 'internals/policy' + baseline_internals_values = 'internals/policy' + else: + internals_values = None + + if self.baseline.max_past_horizon(on_policy=False) == 0: + # Horizons indexing tensor + batch_size = tf_util.cast(x=tf.shape(input=indices)[0], dtype='int') + starts = tf.range(batch_size) + lengths = tf.ones_like(input=indices) + horizons = tf.stack(values=(starts, lengths), axis=1) + + # TODO: remove restriction + if self.predict_action_values and self.separate_baseline: + assert self.policy.max_past_horizon(on_policy=False) == 0 + + # Retrieve horizon values from memory + values = ('states', 'auxiliaries', 'terminal') + if internals_values is not None: + values += (internals_values,) + offsets, values = self.memory.successors( + indices=indices, horizon=reward_horizon, sequence_values=(), final_values=values + ) + states = values['states'] + policy_internals = values.get('internals/policy') + baseline_internals = values.get(baseline_internals_values, TensorDict()) + auxiliaries = values['auxiliaries'] + terminal = values['terminal'] + + # -1 since successors length >= 1 + offsets = offsets - one + + else: + baseline_horizon = self.baseline.past_horizon(on_policy=False) + assertions = list() + if self.config.create_tf_assertions and self.predict_action_values: + policy_horizon = self.policy.past_horizon(on_policy=False) + # TODO: remove restriction + assertions.append(tf.debugging.assert_equal( + x=policy_horizon, y=baseline_horizon, + message="Policy and baseline cannot depend on a different number of " + "previous states if predict_action_values is True." + )) + + with tf.control_dependencies(control_inputs=assertions): + # (Tried to do this more efficiently by differentiating between + # reward horizon >/=/< baseline horizon, but gets too complex since + # it needs to take into account episode start/end edge cases.) 
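+ # Two-step lookup: first the auxiliaries/terminal of the successor at reward_horizon,
+ # then the baseline states sequence and initial internals required to evaluate it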
+ + # Retrieve horizon values from memory + offsets, values = self.memory.successors( + indices=indices, horizon=reward_horizon, sequence_values=(), + final_values=('auxiliaries', 'terminal') + ) + auxiliaries = values['auxiliaries'] + terminal = values['terminal'] + + # -1 since successors length >= 1 + offsets = offsets - one + + # Retrieve baseline states sequence and initial internals from memory + if internals_values is None: + horizons, sequence_values = self.memory.predecessors( + indices=(indices + offsets), horizon=baseline_horizon, + sequence_values=('states',), initial_values=() + ) + policy_internals = None + baseline_internals = TensorDict() + else: + horizons, sequence_values, initial_values = self.memory.predecessors( + indices=indices, horizon=(baseline_horizon - reward_horizon), + sequence_values=('states',), initial_values=(internals_values,) + ) + policy_internals = initial_values.get('internals/policy') + baseline_internals = initial_values.get(baseline_internals_values, TensorDict()) + states = sequence_values['states'] + + # Predict horizon values + if self.predict_action_values: + actions, _ = self.policy.act( + states=states, horizons=horizons, internals=policy_internals, + auxiliaries=auxiliaries, deterministic=true, independent=True + ) + horizon_values = self.baseline.action_value( + states=states, horizons=horizons, internals=baseline_internals, + auxiliaries=auxiliaries, actions=actions + ) + else: + horizon_values = self.baseline.state_value( + states=states, horizons=horizons, internals=baseline_internals, + auxiliaries=auxiliaries + ) + + # Value horizon assertions + assertions = list() + if self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_greater_equal(x=offsets, y=zero)) + if self.baseline.max_past_horizon(on_policy=False) == 0: + baseline_horizon = self.baseline.past_horizon(on_policy=False) + assertions.append(tf.debugging.assert_less_equal(x=offsets, y=reward_horizon)) + + # Add appropriately discounted horizon values to reward + with tf.control_dependencies(control_inputs=assertions): + # Pow numerically stable since 0.0 <= discount <= 1.0 + discounts = tf.math.pow(x=reward_discount, y=tf_util.cast(x=offsets, dtype='float')) + if not self.predict_terminal_values: + is_terminal = tf.math.equal(x=terminal, y=one) + zeros = tf.zeros_like(input=discounts) + discounts = tf.where(condition=is_terminal, x=zeros, y=discounts) + + return reward + discounts * horizon_values + + @tf_function(num_args=7) + def loss(self, *, states, horizons, internals, auxiliaries, actions, reward, reference): + if self.baseline_optimizer is None: + policy_internals = internals['policy'] + else: + policy_internals = internals + if self.baseline_objective is not None and self.baseline_loss_weight is not None and \ + not self.baseline_loss_weight.is_constant(value=0.0): + policy_reference = reference['policy'] + else: + policy_reference = reference + + # Loss per instance + loss = self.objective.loss( + states=states, horizons=horizons, internals=policy_internals, auxiliaries=auxiliaries, + actions=actions, reward=reward, reference=policy_reference, policy=self.policy, + baseline=(self.baseline if self.separate_baseline else None) + ) + + # Objective loss + loss = tf.math.reduce_mean(input_tensor=loss, axis=0) + dependencies = self.summary( + label='loss', name='losses/policy-objective-loss', data=loss, step='updates' + ) + dependencies.extend(self.track(label='loss', name='policy-objective-loss', data=loss)) + + # Regularization losses + regularization_loss 
= self.regularize( + states=states, horizons=horizons, internals=policy_internals, auxiliaries=auxiliaries + ) + dependencies.extend(self.summary( + label='loss', name='losses/policy-regularization-loss', data=regularization_loss, + step='updates' + )) + dependencies.extend( + self.track(label='loss', name='policy-regularization-loss', data=regularization_loss) + ) + loss += regularization_loss + + # Baseline loss + if self.baseline_loss_weight is not None and \ + not self.baseline_loss_weight.is_constant(value=0.0): + if self.separate_baseline: + baseline_internals = internals['baseline'] + else: + baseline_internals = policy_internals + if self.baseline_objective is not None: + baseline_reference = reference['baseline'] + else: + baseline_reference = policy_reference + + zero = tf_util.constant(value=0.0, dtype='float') + baseline_loss_weight = self.baseline_loss_weight.value() + + def no_baseline_loss(): + return zero + + def apply_baseline_loss(): + baseline_loss = self.baseline_loss( + states=states, horizons=horizons, internals=baseline_internals, + auxiliaries=auxiliaries, actions=actions, reward=reward, + reference=baseline_reference + ) + return baseline_loss_weight * baseline_loss + + loss += tf.cond( + pred=tf.math.equal(x=baseline_loss_weight, y=zero), + true_fn=no_baseline_loss, false_fn=apply_baseline_loss + ) + + dependencies.extend(self.summary( + label='loss', name='losses/policy-loss', data=loss, step='updates' + )) + dependencies.extend(self.track(label='loss', name='policy-loss', data=loss)) + + with tf.control_dependencies(control_inputs=dependencies): + return tf_util.identity(input=loss) + + @tf_function(num_args=4, overwrites_signature=True) + def regularize(self, *, states, horizons, internals, auxiliaries): + regularization_loss = super().regularize() + + # Entropy regularization + if not self.entropy_regularization.is_constant(value=0.0): + zero = tf_util.constant(value=0.0, dtype='float') + entropy_regularization = self.entropy_regularization.value() + + def no_entropy_regularization(): + return zero + + def apply_entropy_regularization(): + entropy = self.policy.entropy( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + entropy = tf.math.reduce_mean(input_tensor=entropy, axis=0) + return -entropy_regularization * entropy + + regularization_loss += tf.cond( + pred=tf.math.equal(x=entropy_regularization, y=zero), + true_fn=no_entropy_regularization, false_fn=apply_entropy_regularization + ) + + return regularization_loss + + @tf_function(num_args=7) + def baseline_loss( + self, *, states, horizons, internals, auxiliaries, actions, reward, reference + ): + # Loss per instance + loss = self.baseline_objective.loss( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions, reward=reward, reference=reference, policy=self.baseline + ) + + # Objective loss + loss = tf.math.reduce_mean(input_tensor=loss, axis=0) + + dependencies = list() + if self.separate_baseline: + dependencies.extend(self.summary( + label='loss', name='losses/baseline-objective-loss', data=loss, step='updates' + )) + dependencies.extend( + self.track(label='loss', name='baseline-objective-loss', data=loss) + ) + + # Regularization losses + regularization_loss = self.baseline.regularize() + dependencies.extend(self.summary( + label='loss', name='losses/baseline-regularization-loss', + data=regularization_loss, step='updates' + )) + dependencies.extend(self.track( + label='loss', name='baseline-regularization-loss', 
data=regularization_loss + )) + loss += regularization_loss + + dependencies.extend(self.summary( + label='loss', name='losses/baseline-loss', data=loss, step='updates' + )) + dependencies.extend(self.track(label='loss', name='baseline-loss', data=loss)) + + with tf.control_dependencies(control_inputs=dependencies): + return tf_util.identity(input=loss) diff --git a/tensorforce/core/module.py b/tensorforce/core/module.py new file mode 100644 index 000000000..d302617ff --- /dev/null +++ b/tensorforce/core/module.py @@ -0,0 +1,788 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import functools +import json +import os +import re + +import numpy as np +import tensorflow as tf + +from tensorforce import TensorforceError, util +import tensorforce.core +from tensorforce.core import SignatureDict, TensorSpec, tf_util, VariableDict + + +def make_key(*, x): + try: + hash(x) + if x is not None: + x < x + return x + except TypeError as exc: + if isinstance(x, tuple) and len(x) > 0 and all(isinstance(y, tf.Variable) for y in x): + return tuple(y.name for y in x) + elif isinstance(x, Module): + return x.__class__.__name__ + elif hasattr(x, '__name__'): + return x.__name__ + else: + raise exc + + +def tf_function( + *, num_args, optional=0, api_function=False, overwrites_signature=False, is_loop_body=False, + dict_interface=False +): + + def decorator(function): + + def decorated(self, *args, _initialize=False, **kwargs): + assert api_function or not _initialize + assert len(args) == 0 or len(kwargs) == 0 + assert len(args) == 0 or len(args) == num_args + + # Function name and qualname + name = function.__name__ + qualname = function.__qualname__ + + # Parameters-to-graph mapping + if not hasattr(self, '_{name}_graphs'.format(name=name)): + setattr(self, '_{name}_graphs'.format(name=name), dict()) + assert function.__qualname__.endswith('.' 
+ name) + setattr(self, '_{name}_qualname'.format(name=name), function.__qualname__) + function_graphs = getattr(self, '_{name}_graphs'.format(name=name)) + qualname = getattr(self, '_{name}_qualname'.format(name=name)) + + # Handle overwriting signature + if overwrites_signature: + setattr(self, '_{name}_overwritten'.format(name=name), overwrites_signature) + overwritten = getattr(self, '_{name}_overwritten'.format(name=name), False) + + # Graph signature + input_signature = self.input_signature(function=name) + output_signature = self.output_signature(function=name) + + # Apply raw function if qualname mismatch, which indicates super() call + if function.__qualname__ != qualname: + assert not _initialize + if not overwritten: + assert num_args - optional <= input_signature.num_args() <= num_args + return function(self, *args, **kwargs) + + # Check number of arguments + assert num_args - optional <= input_signature.num_args() <= num_args + + # Graph parameters + params_kwargs = { + key: arg for key, arg in kwargs.items() if key not in input_signature + } + graph_params = tuple(make_key(x=arg) for arg in params_kwargs.values()) + + # Check whether output_signature is parametrized + if not isinstance(output_signature, SignatureDict): + output_signature = output_signature(**params_kwargs) + + # Function graph + if str(graph_params) not in function_graphs: + assert not api_function or _initialize + + def function_graph(*args): + with self: + # TODO: tf.name_scope instead? + kwargs = input_signature.args_to_kwargs(args=args, from_dict=dict_interface) + args = function(self, **kwargs.to_kwargs(), **params_kwargs) + args = output_signature.kwargs_to_args(kwargs=args, to_dict=dict_interface) + return args + + function_graph.__name__ = name + function_graph.__qualname__ = qualname + + function_graphs[str(graph_params)] = tf.function( + func=function_graph, + input_signature=input_signature.to_list(to_dict=dict_interface), + autograph=False + # experimental_implements=None, experimental_autograph_options=None, + # experimental_relax_shapes=False, experimental_compile=None + ) + + # Do not call function if initialization + if _initialize: + return + + # Graph arguments + if len(kwargs) > 0: + graph_args = input_signature.kwargs_to_args( + kwargs=kwargs, to_dict=dict_interface, outer_tuple=True + ) + else: + graph_args = args + + # Apply function graph + with self: + output_args = function_graphs[str(graph_params)](*graph_args) + if not is_loop_body: + return output_signature.args_to_kwargs( + args=output_args, outer_tuple=True, from_dict=dict_interface + ) + else: + return output_args + + return decorated + + return decorator + + +class Module(tf.Module): + """ + Base class for modules. + + Args: + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. 
+ """ + + _TF_MODULE_IGNORED_PROPERTIES = \ + tf.Module._TF_MODULE_IGNORED_PROPERTIES | {'_MODULE_STACK', 'parent'} + + # _MODULE_STACK # Initialized as part of model.__init__() + + def __init__(self, *, device=None, l2_regularization=None, name=None): + name = name.replace('/', '_') + super().__init__(name=name) + + self.checkpoint = None + self.is_trainable = None + self.is_saved = None + self.is_initialized = None + + assert len(Module._MODULE_STACK) >= 1 + if isinstance(Module._MODULE_STACK[-1], type): + assert isinstance(self, Module._MODULE_STACK[-1]) + else: + # Not always type, e.g. tf_optimizer uses functools.partial + assert isinstance(Module._MODULE_STACK[-1], functools.partial) and \ + isinstance(self, Module._MODULE_STACK[-1].func) + Module._MODULE_STACK[-1] = self + if len(Module._MODULE_STACK) > 1: + self.parent = Module._MODULE_STACK[-2] + else: + self.parent = None + + # Device + if device is None: + self.device = util.NullContext() + else: + self.device = tf.device(device_name=device) + + # L2 regularization + if l2_regularization is None: + self.l2_regularization = None + else: + self.l2_regularization = self.submodule( + name='l2_regularization', module=l2_regularization, + modules=tensorforce.core.parameter_modules, is_trainable=False, dtype='float', + min_value=0.0 + ) + + @property + def root(self): + return self.parent.root + + @property + def config(self): + return self.parent.config + + @property + def full_name(self): + return '{}/{}'.format(self.parent.full_name, self.name) + + @property + def tensorforce_submodules(self): + predicate = (lambda x: isinstance(x, Module)) + return list(self._flatten(recursive=True, predicate=predicate)) + + @property + def this_submodules(self): + predicate = (lambda x: isinstance(x, tf.Module)) + return list(self._flatten(recursive=False, predicate=predicate)) + + @property + def this_trainable_variables(self): + predicate = (lambda x: isinstance(x, tf.Variable) and getattr(x, 'trainable', False)) + return list(self._flatten(recursive=False, predicate=predicate)) + + @property + def this_tensorforce_submodules(self): + predicate = (lambda x: isinstance(x, Module)) + return list(self._flatten(recursive=True, predicate=predicate)) + + # @property + # def trainable_variables(self): + # predicate = (lambda x: isinstance(x, tf.Variable) and getattr(x, 'trainable', False)) + # variables = list(self._flatten(recursive=False, predicate=predicate)) + # for module in self.this_submodules: + # # if not isinstance(module, Module) or module.is_trainable: + # variables.extend(module.trainable_variables) + # return variables + + @property + def saved_variables(self): + predicate = (lambda x: isinstance(x, tf.Variable) and getattr(x, 'is_saved', True)) + variables = list(self._flatten(recursive=False, predicate=predicate)) + for module in self.this_submodules: + if not isinstance(module, Module): + variables.extend(module.variables) + elif module.is_saved: + variables.extend(module.saved_variables) + return variables + + def __enter__(self): + Module._MODULE_STACK.append(self) + self.device.__enter__() + assert isinstance(self.is_initialized, bool) + if self.is_initialized: + self.name_scope.__enter__() + else: + self._proper_name_scope = tf.name_scope(name=self.name) + self._proper_name_scope.__enter__() + return self + + def __exit__(self, etype, exception, traceback): + if self.is_initialized: + self.name_scope.__exit__(etype, exception, traceback) + else: + self._proper_name_scope.__exit__(etype, exception, traceback) + 
self.device.__exit__(etype, exception, traceback) + popped = Module._MODULE_STACK.pop() + assert popped is self + + def initialize(self): + self.summary_steps = VariableDict() + self.tracking_variables = VariableDict() + assert self.is_initialized is False + for module in self.this_submodules: + if isinstance(module, Module): + assert module.is_initialized is None + module.is_initialized = False + with module: + module.initialize() + assert module.is_initialized is False + module.is_initialized = True + + def save(self, *, directory, filename=None): + if filename is None: + filename = self.full_name.replace('/', '.') + if self.checkpoint is None: + self.checkpoint = tf.train.Checkpoint(**{self.name: self}) + return self.checkpoint.write(file_prefix=os.path.join(directory, filename)) + + def restore(self, *, directory, filename=None): + if filename is None: + filename = self.full_name.replace('/', '.') + if self.checkpoint is None: + self.checkpoint = tf.train.Checkpoint(**{self.name: self}) + try: + self.checkpoint.restore(save_path=os.path.join(directory, filename)).expect_partial() + except AssertionError as exc: + if len(exc.args) != 1 or not re.match( + pattern=r"Some Python objects were not bound to checkpointed values, likely due to " + r"changes in the Python program: \[(, )*\]", + string=exc.args[0] + ): + raise exc + + def input_signature(self, *, function): + if function == 'regularize': + return SignatureDict() + + else: + raise NotImplementedError + + def output_signature(self, *, function): + if function == 'regularize': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + else: + raise NotImplementedError + + @tf_function(num_args=0) + def regularize(self): + zero = tf_util.constant(value=0.0, dtype='float') + + module = self + while module.l2_regularization is None: + module = module.parent + + if len(self.this_trainable_variables) == 0 or \ + module.l2_regularization.is_constant(value=0.0): + regularization_loss = zero + + else: + l2_regularization = module.l2_regularization.value() + + def no_l2_regularization(): + return zero + + def apply_l2_regularization(): + l2_variables = list() + for variable in self.this_trainable_variables: + variable = tf_util.cast(x=variable, dtype='float') + l2_variables.append(tf.reduce_sum(input_tensor=tf.square(x=variable))) + return l2_regularization * tf.math.add_n(inputs=l2_variables) + + skip_l2_regularization = tf.math.equal(x=l2_regularization, y=zero) + regularization_loss = tf.cond( + pred=skip_l2_regularization, true_fn=no_l2_regularization, + false_fn=apply_l2_regularization + ) + + for module in self.this_submodules: + if isinstance(module, Module) and module.is_trainable: + regularization_loss += module.regularize() + + return regularization_loss + + @staticmethod + def get_module_class_and_args( + *, name, module=None, modules=None, default_module=None, disable_first_arg=False, **kwargs + ): + # name + if not isinstance(name, str): + raise TensorforceError.type(name='Module.add_module', argument='name', dtype=type(name)) + # modules + if modules is not None and not isinstance(modules, dict): + raise TensorforceError.type( + name='Module.add_module', argument='modules', dtype=type(modules) + ) + + # default_module + if default_module is not None and default_module not in modules and \ + not issubclass(default_module, Module): + raise TensorforceError.value( + name='Module.add_module', argument='default_module', value=default_module + ) + + # disable_first_arg + if not 
isinstance(disable_first_arg, bool): + raise TensorforceError.type( + name='Module.add_module', argument='disable_first_arg', + dtype=type(disable_first_arg) + ) + + # module + if isinstance(module, dict): + # Dictionary module specification (type either given via 'type' or 'default_module') + util.deep_disjoint_update(target=kwargs, source=module) + module = kwargs.pop('type', default_module) + return Module.get_module_class_and_args( + name=name, module=module, modules=modules, default_module=default_module, + disable_first_arg=True, **kwargs + ) + + elif isinstance(module, str): + if os.path.isfile(module): + # JSON file module specification + with open(module, 'r') as fp: + module = json.load(fp=fp) + return Module.get_module_class_and_args( + name=name, module=module, modules=modules, default_module=default_module, + disable_first_arg=True, **kwargs + ) + + elif modules is not None and module in modules: + # Keyword module specification + return Module.get_module_class_and_args( + name=name, module=modules[module], modules=modules, + default_module=default_module, disable_first_arg=True, **kwargs + ) + + else: + # Library module specification + assert modules is not None + parent_class = next(iter(modules.values())) + while len(parent_class.mro()) >= 4 and parent_class.mro()[1] != Module: + parent_class = parent_class.mro()[1] + _module = util.try_import_module(module=module, parent_class=parent_class) + if _module is not None: + return Module.get_module_class_and_args( + name=name, module=_module, modules=modules, default_module=default_module, + disable_first_arg=True, **kwargs + ) + + if 'default' in modules or default_module is not None: + # Default module specification + if '_first_arg' in kwargs: + raise TensorforceError.invalid( + name='Module.add_module', argument='_first_arg' + ) + if module is not None: + if disable_first_arg: + raise TensorforceError.value( + name='Module.add_module', argument='module', value=module + ) + kwargs['_first_arg'] = module + if default_module is None: + default_module = modules['default'] + return Module.get_module_class_and_args( + name=name, module=default_module, modules=modules, **kwargs + ) + + else: + raise TensorforceError.value( + name='Module.add_module', argument='module', value=module + ) + + elif (not callable(module) or isinstance(module, tf.keras.Model) or ( + isinstance(module, type) and issubclass(module, tf.keras.Model) + )) and ('default' in modules or default_module is not None): + # Default module specification + if '_first_arg' in kwargs: + raise TensorforceError.invalid(name='Module.add_module', argument='_first_arg') + if module is not None: + kwargs['_first_arg'] = module + if default_module is None: + default_module = modules['default'] + return Module.get_module_class_and_args( + name=name, module=default_module, modules=modules, **kwargs + ) + + elif callable(module): + if '_first_arg' in kwargs: + args = (kwargs.pop('_first_arg'),) + else: + args = () + kwargs['name'] = name + return module, args, kwargs + + else: + raise TensorforceError.value(name='Module.add_module', argument='module', value=module) + + def submodule( + self, *, name, module=None, modules=None, default_module=None, is_trainable=True, + is_saved=True, **kwargs + ): + assert self.is_initialized is None + + # name + if any(name == module.name for module in self.this_submodules): + raise TensorforceError.exists(name='module', value=name) + + # is_trainable + if not isinstance(is_trainable, bool): + raise TensorforceError.type( + 
name='Module.add_module', argument='is_trainable', dtype=type(is_trainable) + ) + + # is_saved + if not isinstance(is_saved, bool): + raise TensorforceError.type( + name='Module.add_module', argument='is_saved', dtype=type(is_saved) + ) + + # module, modules, default_module + module_cls, args, kwargs = Module.get_module_class_and_args( + name=name, module=module, modules=modules, default_module=default_module, **kwargs + ) + + # Module constructor + Module._MODULE_STACK.append(module_cls) + module = module_cls(*args, **kwargs) + popped = Module._MODULE_STACK.pop() + assert popped is module + + assert module.is_trainable is None + module.is_trainable = is_trainable + assert module.is_saved is None + module.is_saved = is_saved + + return module + + def variable( + self, *, name, spec, initializer, is_trainable, is_saved, initialization_scale=None + ): + assert self.is_initialized is False + # name + if not isinstance(name, str): + raise TensorforceError.type(name='variable', argument='name', dtype=type(name)) + name = name.replace('/', '_') + # spec + if not isinstance(spec, TensorSpec): + raise TensorforceError.type(name='variable', argument='spec', dtype=type(spec)) + if spec.is_underspecified(): + raise TensorforceError.value( + name='variable', argument='spec', value=spec, hint='underspecified' + ) + # initializer + initializer_names = ( + 'constant', 'normal', 'normal-relu', 'ones', 'orthogonal', 'orthogonal-relu', 'zeros' + ) + if not isinstance(initializer, (spec.py_type(), np.ndarray, tf.Tensor)) and \ + initializer not in initializer_names: + raise TensorforceError.value(name='variable', argument='initializer', value=initializer) + elif isinstance(initializer, np.ndarray) and initializer.dtype != spec.np_type(): + raise TensorforceError.type( + name='variable', argument='initializer', dtype=initializer.dtype + ) + elif isinstance(initializer, tf.Tensor) and tf_util.dtype(x=initializer) != spec.tf_type(): + raise TensorforceError.type( + name='variable', argument='initializer', dtype=tf_util.dtype(x=initializer) + ) + # initialization_scale + if initialization_scale is not None: + if isinstance(initializer, (spec.py_type(), np.ndarray, tf.Tensor)) or \ + initializer not in ('constant', 'orthogonal', 'orthogonal-relu'): + raise TensorforceError.invalid( + name='variable', argument='initialization_scale', + condition='initializer not orthogonal' + ) + elif not isinstance(initialization_scale, spec.py_type()): + raise TensorforceError.type( + name='variable', argument='initialization_scale', + dtype=type(initialization_scale), hint='!= float' + ) + # is_trainable + if not isinstance(is_trainable, bool): + raise TensorforceError.type( + name='variable', argument='is_trainable', dtype=type(is_trainable) + ) + elif is_trainable and spec.type != 'float': + raise TensorforceError.value( + name='variable', argument='is_trainable', value=is_trainable, + condition='spec.type != float' + ) + # is_saved + if not isinstance(is_saved, bool): + raise TensorforceError.type(name='variable', argument='is_saved', dtype=type(is_saved)) + + # Variable initializer + if isinstance(initializer, spec.py_type()): + initializer = tf_util.constant(value=initializer, dtype=spec.type, shape=spec.shape) + elif isinstance(initializer, np.ndarray): + if initializer.shape != spec.shape: + raise TensorforceError.mismatch( + name='Module.variable', value1='shape', value2='initializer' + ) + initializer = tf_util.constant(value=initializer, dtype=spec.type) + elif isinstance(initializer, tf.Tensor): + if 
tf_util.shape(x=initializer) != spec.shape: + raise TensorforceError.mismatch( + name='Module.variable', value1='shape', value2='initializer' + ) + initializer = initializer + elif not isinstance(initializer, str): + raise TensorforceError("Invalid variable initializer: {}".format(initializer)) + elif initializer.startswith('normal'): + if spec.type != 'float': + raise TensorforceError( + message="Invalid variable initializer value for non-float variable: {}.".format( + initializer + ) + ) + if initializer.endswith('-relu'): + stddev = min(0.1, np.sqrt(2.0 / util.product(xs=spec.shape[:-1]))) + else: + stddev = min(0.1, np.sqrt(2.0 / (util.product(xs=spec.shape[:-1]) + spec.shape[-1]))) + initializer = tf.random.normal(shape=spec.shape, stddev=stddev, dtype=spec.tf_type()) + elif initializer.startswith('orthogonal'): + if spec.type != 'float': + raise TensorforceError( + message="Invalid variable initializer value for non-float variable: {}.".format( + initializer + ) + ) + if spec.rank < 2: + raise TensorforceError( + message="Invalid variable initializer value for 0/1-rank variable: {}.".format( + initializer + ) + ) + normal = np.random.normal(size=(util.product(xs=spec.shape[:-1]), spec.shape[-1])) + u, _, v = np.linalg.svd(a=normal, full_matrices=False) + orthogonal = u if u.shape[1] == spec.shape[-1] else v + if initializer.endswith('-relu'): + orthogonal = orthogonal * np.sqrt(2.0) + if initialization_scale is not None and initialization_scale != 1.0: + if initialization_scale <= 0.0: + raise TensorforceError.value( + name='variable', argument='initialization_scale', + value=initialization_scale, hint='<= 0.0' + ) + orthogonal = orthogonal * initialization_scale + initializer = tf_util.constant(value=orthogonal.reshape(spec.shape), dtype=spec.type) + elif initializer == 'zeros': + initializer = tf_util.zeros(shape=spec.shape, dtype=spec.type) + elif initializer == 'ones': + initializer = tf_util.ones(shape=spec.shape, dtype=spec.type) + elif initializer == 'constant': + initializer = tf.fill( + dims=spec.shape, value=tf_util.constant(value=initialization_scale, dtype=spec.type) + ) + + # Variable + variable = tf.Variable( + initial_value=initializer, trainable=is_trainable, validate_shape=True, name=name, + dtype=spec.tf_type(), shape=spec.shape + ) + variable.is_saved = is_saved + + return variable + + def register_summary(self, *, label, name): + # label + if not isinstance(label, str): + raise TensorforceError.type( + name='Module.register_summary', argument='label', dtype=type(label) + ) + # name + if not isinstance(name, (str, tuple, list)): + raise TensorforceError.type( + name='Module.register_summary', argument='name', dtype=type(name) + ) + if not isinstance(name, str): + name = name[0] + if name in self.summary_steps: + raise TensorforceError.value( + name='Module.register_summary', argument='name', hint='already exists' + ) + + if self.root.summaries == 'all' or label in self.root.summaries: + self.summary_steps[name] = self.variable( + name=(name + '-summary-step'), spec=TensorSpec(type='int'), initializer=-1, + is_trainable=False, is_saved=False + ) + + def register_tracking(self, *, label, name, spec): + # label + if not isinstance(label, str): + raise TensorforceError.type( + name='Module.register_tracking', argument='label', dtype=type(label) + ) + # name + if not isinstance(name, str): + raise TensorforceError.type( + name='Module.register_tracking', argument='name', dtype=type(name) + ) + if name in self.tracking_variables: + raise TensorforceError.value( + 
name='Module.register_tracking', argument='name', hint='already exists' + ) + # spec + if not isinstance(spec, TensorSpec): + raise TensorforceError.type( + name='Module.register_tracking', argument='spec', dtype=type(spec) + ) + + if self.root.tracking == 'all' or label in self.root.tracking: + self.tracking_variables[name] = self.variable( + name=(name + '-tracking'), spec=spec, initializer='zeros', is_trainable=False, + is_saved=False + ) + + def summary(self, *, label, name, data, step): + # label + if not isinstance(label, str): + raise TensorforceError.type(name='Module.summary', argument='label', dtype=type(label)) + # name + if not isinstance(name, (str, tuple, list)): + raise TensorforceError.type(name='Module.summary', argument='name', dtype=type(name)) + if isinstance(name, str): + names = None + else: + names = name + name = name[0] + # data + if not tf_util.is_tensor(x=data) and not callable(data): + raise TensorforceError.type(name='Module.summary', argument='data', dtype=type(data)) + # step + if step not in self.root.units: + raise TensorforceError.value(name='Module.summary', argument='step', value=step) + + if self.root.summaries == 'all' or label in self.root.summaries: + if name not in self.summary_steps: + raise TensorforceError.value( + name='Module.summary', argument='name', value=name, hint='is not registered' + ) + + unit = self.root.units[step] + + def fn_summary(): + if callable(data): + value = data() + else: + value = data + dependencies = list() + with self.root.summarizer.as_default(): + if names is None: + dependencies.append(tf.summary.scalar(name=name, data=value, step=unit)) + else: + for n, x in zip(names, value): + dependencies.append(tf.summary.scalar(name=n, data=x, step=unit)) + previous = self.summary_steps[name] + dependencies.append(previous.assign(value=unit, read_value=False)) + return tf.group(*dependencies) + + pred = unit > self.summary_steps[name] + return [tf.cond(pred=pred, true_fn=fn_summary, false_fn=tf.no_op)] + + else: + return list() + + def track(self, *, label, name, data): + # label + if not isinstance(label, str): + raise TensorforceError.type(name='Module.track', argument='label', dtype=type(label)) + # name + if not isinstance(name, str): + raise TensorforceError.type(name='Module.track', argument='name', dtype=type(name)) + # data + if not tf_util.is_tensor(x=data) and not callable(data): + raise TensorforceError.type(name='Module.track', argument='data', dtype=type(data)) + + if self.root.tracking == 'all' or label in self.root.tracking: + if name not in self.tracking_variables: + raise TensorforceError.value( + name='Module.track', argument='name', value=name, hint='is not registered' + ) + + if callable(data): + value = data() + else: + value = data + assignment = self.tracking_variables[name].assign(value=value) + return [assignment] + + else: + return list() + + def tracked_tensors(self): + modules = [self] + modules.extend(self.tensorforce_submodules) + tracked_tensors = dict() + for module in modules: + for name, value in module.tracking_variables.items(): + value = value.numpy() + if value.shape == (): + value = value.item() + tracked_tensors[module.full_name + '/' + name] = value + return tracked_tensors diff --git a/tensorforce/core/networks/__init__.py b/tensorforce/core/networks/__init__.py index 7142b1895..b4c83eb64 100755 --- a/tensorforce/core/networks/__init__.py +++ b/tensorforce/core/networks/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. 
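The `orthogonal`/`orthogonal-relu` branch of `Module.variable` above derives the initial value from the SVD of a random normal matrix. A standalone numpy sketch of that branch (the `orthogonal_init` helper name is hypothetical; shapes and scaling follow the code above):

```python
# Sketch of the SVD-based 'orthogonal' initializer used by Module.variable above.
# The helper name is hypothetical; 'orthogonal-relu' additionally scales by sqrt(2),
# and initialization_scale multiplies the result, as in the diff.
import numpy as np

def orthogonal_init(shape, relu=False, scale=1.0):
    # Collapse all leading dimensions into rows, keep the last dimension as columns
    rows, cols = int(np.prod(shape[:-1])), shape[-1]
    normal = np.random.normal(size=(rows, cols))
    u, _, v = np.linalg.svd(normal, full_matrices=False)
    # Pick whichever orthonormal factor matches the requested number of columns
    orthogonal = u if u.shape[1] == cols else v
    if relu:
        orthogonal = orthogonal * np.sqrt(2.0)  # gain suited to ReLU activations
    return (orthogonal * scale).reshape(shape)

weights = orthogonal_init(shape=(4, 8, 16), relu=True)
assert weights.shape == (4, 8, 16)
```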
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,48 +13,21 @@ # limitations under the License. # ============================================================================== -from tensorforce.core.networks.layer import Layer, TFLayer, Nonlinearity, Dropout, Flatten, Pool2d, Embedding, Linear, Dense, \ - Dueling, Conv1d, Conv2d, InternalLstm, Lstm -from tensorforce.core.networks.network import Network, LayerBasedNetwork, LayeredNetwork -from tensorforce.core.networks.complex_network import Input, Output +from tensorforce.core.networks.network import Network, LayerbasedNetwork, LayeredNetwork +# Require Network/LayerbasedNetwork +from tensorforce.core.networks.auto import AutoNetwork +from tensorforce.core.networks.keras import KerasNetwork +from tensorforce.core.networks.preprocessor import Preprocessor -layers = dict( - tf_layer=TFLayer, - nonlinearity=Nonlinearity, - dropout=Dropout, - flatten=Flatten, - pool2d=Pool2d, - embedding=Embedding, - linear=Linear, - dense=Dense, - dueling=Dueling, - conv1d=Conv1d, - conv2d=Conv2d, - internal_lstm=InternalLstm, - lstm=Lstm, - input=Input, - output=Output + +network_modules = dict( + auto=AutoNetwork, custom=LayeredNetwork, default=LayeredNetwork, keras=KerasNetwork, + layered=LayeredNetwork ) __all__ = [ - 'layers', - 'Layer', - 'TFLayer', - 'Nonlinearity', - 'Dropout', - 'Flatten', - 'Pool2d', - 'Embedding', - 'Linear', - 'Dense', - 'Dueling', - 'Conv1d', - 'Conv2d', - 'InternalLstm', - 'Lstm', - 'Network', - 'LayerBasedNetwork', - 'LayeredNetwork' + 'AutoNetwork', 'LayerbasedNetwork', 'KerasNetwork', 'LayeredNetwork', 'Network', + 'network_modules', 'Preprocessor' ] diff --git a/tensorforce/core/networks/auto.py b/tensorforce/core/networks/auto.py new file mode 100644 index 000000000..6a445b051 --- /dev/null +++ b/tensorforce/core/networks/auto.py @@ -0,0 +1,177 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import logging + +from tensorforce import TensorforceError +from tensorforce.core.networks import LayeredNetwork + + +class AutoNetwork(LayeredNetwork): + """ + Network whose architecture is automatically configured based on input types and shapes, + offering high-level customization (specification key: `auto`). + + Args: + size (int > 0): Layer size, before concatenation if multiple states + (default: 64). + depth (int > 0): Number of layers per state, before concatenation if multiple states + (default: 2). + final_size (int > 0): Layer size after concatenation if multiple states + (default: layer size). + final_depth (int > 0): Number of layers after concatenation if multiple states + (default: 1). 
+ rnn (false | [parameter](../modules/parameters.html), int >= 0): Whether to add an LSTM cell + with internal state as last layer, and if so, horizon of the LSTM for truncated + backpropagation through time + (default: false). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + inputs_spec (specification): internal use. + outputs (iter[string]): internal use. + """ + + def __init__( + self, *, size=64, depth=2, final_size=None, final_depth=1, rnn=False, device=None, + l2_regularization=None, name=None, inputs_spec=None, outputs=None, + # Deprecated + internal_rnn=None + ): + if internal_rnn is not None: + raise TensorforceError.deprecated( + name='AutoNetwork', argument='internal_rnn', replacement='rnn' + ) + + if len(inputs_spec) == 1: + if final_size is not None: + raise TensorforceError.invalid( + name='AutoNetwork', argument='final_size', condition='input size = 1' + ) + if final_depth is not None and final_depth != 1: + raise TensorforceError.invalid( + name='AutoNetwork', argument='final_depth', condition='input size = 1' + ) + + if len(inputs_spec) > 8: + logging.warning("Large number of state components {} which may cause poor performance, " + "consider merging components where possible.".format(len(inputs_spec))) + + if outputs is not None: + raise TensorforceError.invalid( + name='policy', argument='single_output', condition='AutoNetwork' + ) + + if final_size is None: + final_size = size + if final_depth is None: + final_depth = 0 + + layers = list() + for input_name, spec in inputs_spec.items(): + if len(inputs_spec) == 1: + state_layers = layers + else: + state_layers = list() + layers.append(state_layers) + + # Retrieve input state + if input_name is None: + prefix = '' + else: + prefix = input_name + '_' + state_layers.append(dict( + type='retrieve', name=(prefix + 'retrieve'), tensors=(input_name,) + )) + + # Embed bool and int states + requires_embedding = (spec.type == 'bool' or spec.type == 'int') + if spec.type == 'int' and spec.num_values is None: + if input_name is None: + raise TensorforceError.required( + name='state', argument='num_values', condition='state type is int' + ) + else: + raise TensorforceError.required( + name=(input_name + ' state'), argument='num_values', + condition='state type is int' + ) + if requires_embedding: + state_layers.append(dict( + type='embedding', name=(prefix + 'embedding'), size=size + )) + + # Shape-specific layer type + if spec.rank == 1 - requires_embedding: + layer = 'dense' + elif spec.rank == 2 - requires_embedding: + layer = 'conv1d' + elif spec.rank == 3 - requires_embedding: + layer = 'conv2d' + elif spec.rank == 0: + state_layers.append(dict(type='flatten', name=(prefix + 'flatten'))) + layer = 'dense' + else: + raise TensorforceError.value( + name='AutoNetwork', argument='input rank', value=spec.rank, hint='>= 3' + ) + + # Repeat layer according to depth (one less if embedded) + for n in range(depth - requires_embedding): + state_layers.append(dict( + type=layer, name='{}{}{}'.format(prefix, layer, n), size=size + )) + + # Max pool if rank greater than one + if spec.rank > 1 - requires_embedding: + state_layers.append(dict( + type='pooling', name=(prefix + 'pooling'), reduction='max' + )) + + # Register state-specific embedding + if input_name is not None: + state_layers.append(dict( + type='register', name=(prefix + 'register'), tensor=(input_name + 
'-embedding') + )) + + # Final combined layers + if len(inputs_spec) == 1: + final_layers = layers + + else: + final_layers = list() + layers.append(final_layers) + + # Retrieve state-specific embeddings + final_layers.append(dict( + type='retrieve', name='retrieve', + tensors=tuple(input_name + '-embedding' for input_name in inputs_spec), + aggregation='concat' + )) + + # Repeat layer according to depth + for n in range(final_depth): + final_layers.append(dict(type='dense', name=('dense' + str(n)), size=final_size)) + + # Rnn + if rnn is not None and rnn is not False: + final_layers.append(dict(type='lstm', name='lstm', size=final_size, horizon=rnn)) + + super().__init__( + layers=layers, device=device, l2_regularization=l2_regularization, name=name, + inputs_spec=inputs_spec, outputs=outputs + ) diff --git a/tensorforce/core/networks/complex_network.py b/tensorforce/core/networks/complex_network.py deleted file mode 100755 index d08f518b5..000000000 --- a/tensorforce/core/networks/complex_network.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from collections import Counter -import json -import os - -import tensorflow as tf - -from tensorforce import TensorForceError -from tensorforce.core.networks import Layer -from tensorforce.core.networks.network import LayerBasedNetwork - - -class Input(Layer): - """ - Input layer. Used for ComplexLayerNetwork's to collect data together - as a form of output to the next layer. Allows for multiple inputs - to merge into a single import for next layer. - """ - - def __init__( - self, - inputs, - axis=1, - scope='merge_inputs', - summary_labels=() - ): - """ - Input layer. 
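A hedged usage sketch for the `auto` network added above (the `Agent.create`/`Environment.create` calls and the CartPole environment are assumptions, not part of this diff; `size`, `depth` and `rnn` correspond to the `AutoNetwork` arguments documented above):

```python
# Usage sketch, not part of this diff: selecting the automatically configured network
# via its 'auto' specification key. Agent/environment setup is an assumption.
from tensorforce import Agent, Environment

environment = Environment.create(environment='gym', level='CartPole-v1')

agent = Agent.create(
    agent='ppo', environment=environment, batch_size=10,
    network=dict(type='auto', size=64, depth=2, rnn=False)
)
```

With a single state component, `final_size` and `final_depth` are left unset, since the constructor above rejects them in that case.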
- - Args: - inputs: A list of strings that name the inputs to merge - axis: Axis to merge the inputs - - """ - self.inputs = inputs - self.axis = axis - super(Input, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - inputs_to_merge = list() - for name in self.inputs: - # Previous input, by name or "*", like normal network_spec - # Not using named_tensors as there could be unintended outcome - if name == "*" or name == "previous": - inputs_to_merge.append(x) - elif name in self.named_tensors: - inputs_to_merge.append(self.named_tensors[name]) - else: - # Failed to find key in available inputs, print out help to user, raise error - keys = list(self.named_tensors) - raise TensorForceError( - 'ComplexNetwork input "{}" doesn\'t exist, Available inputs: {}'.format(name, keys) - ) - # Review data for casting to more precise format so TensorFlow doesn't throw error for mixed data - # Quick & Dirty cast only promote types: bool=0,int32=10, int64=20, float32=30, double=40 - - cast_type_level = 0 - cast_type_dict = { - 'bool': 0, - 'int32': 10, - 'int64': 20, - 'float32': 30, - 'float64': 40 - } - cast_type_func_dict = { - 0: tf.identity, - 10: tf.to_int32, - 20: tf.to_int64, - 30: tf.to_float, - 40: tf.to_double - } - # Scan inputs for max cast_type - for tensor in inputs_to_merge: - key = str(tensor.dtype.name) - if key in cast_type_dict: - if cast_type_dict[key] > cast_type_level: - cast_type_level = cast_type_dict[key] - else: - raise TensorForceError('Network spec input does not support dtype {}'.format(key)) - - # Add casting if needed - for index, tensor in enumerate(inputs_to_merge): - key = str(tensor.dtype.name) - - if cast_type_dict[key] < cast_type_level: - inputs_to_merge[index] = cast_type_func_dict[cast_type_level](tensor) - - input_tensor = tf.concat(inputs_to_merge, self.axis) - - return input_tensor - - -class Output(Layer): - """ - Output layer. Used for ComplexLayerNetwork's to capture the tensor - under and name for use with Input layers. Acts as a input to output passthrough. - """ - - def __init__( - self, - output, - scope='output', - summary_labels=() - ): - """ - Output layer. - - Args: - output: A string that names the tensor, will be added to available inputs - - """ - self.output = output - super(Output, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - self.named_tensors[self.output] = x - return x - - -class ComplexLayeredNetwork(LayerBasedNetwork): - """ - Complex Network consisting of a sequence of layers, which can be created from a specification dict. - """ - - def __init__(self, complex_layers_spec, scope='layered-network', summary_labels=()): - """ - Complex Layered network. 
- - Args: - complex_layers_spec: List of layer specification dicts - """ - super(ComplexLayeredNetwork, self).__init__(scope=scope, summary_labels=summary_labels) - self.complex_layers_spec = complex_layers_spec - #self.named_tensors = dict() - - layer_counter = Counter() - - for branch_spec in self.complex_layers_spec: - for layer_spec in branch_spec: - if isinstance(layer_spec['type'], str): - name = layer_spec['type'] - else: - name = 'layer' - scope = name + str(layer_counter[name]) - layer_counter[name] += 1 - - layer = Layer.from_spec( - spec=layer_spec, - kwargs=dict(scope=scope, summary_labels=summary_labels) - ) - # Link named dictionary reference into Layer - layer.tf_tensors(named_tensors=self.named_tensors) - self.add_layer(layer=layer) - - def tf_apply(self, x, internals, update, return_internals=False): - if isinstance(x, dict): - self.named_tensors.update(x) - if len(x) == 1: - x = next(iter(x.values())) - - next_internals = dict() - for layer in self.layers: - layer_internals = {name: internals['{}_{}'.format(layer.scope, name)] for name in layer.internals_spec()} - - if len(layer_internals) > 0: - x, layer_internals = layer.apply(x=x, update=update, **layer_internals) - for name, internal in layer_internals.items(): - next_internals['{}_{}'.format(layer.scope, name)] = internal - - else: - x = layer.apply(x=x, update=update) - - if return_internals: - return x, next_internals - else: - return x - - @staticmethod - def from_json(filename): # TODO: NOT TESTED - """ - Creates a complex_layered_network_builder from a JSON. - - Args: - filename: Path to configuration - - Returns: A ComplexLayeredNetwork class with layers generated from the JSON - """ - path = os.path.join(os.getcwd(), filename) - with open(path, 'r') as fp: - config = json.load(fp=fp) - return ComplexLayeredNetwork(complex_layers_spec=config) diff --git a/tensorforce/core/networks/keras.py b/tensorforce/core/networks/keras.py new file mode 100644 index 000000000..aa2bf35c0 --- /dev/null +++ b/tensorforce/core/networks/keras.py @@ -0,0 +1,113 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.networks import Network + + +class KerasNetwork(Network): + """ + Wrapper class for networks specified as Keras model (specification key: `keras`). + + Args: + model (tf.keras.Model): Keras model + (required). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + inputs_spec (specification): internal use. + outputs (iter[string]): internal use. + kwargs: Arguments for the Keras model. 
+ """ + + def __init__( + self, *, model, device=None, l2_regularization=None, name=None, inputs_spec=None, + outputs=None, **kwargs + ): + if outputs is not None: + raise TensorforceError.invalid( + name='policy', argument='single_output', condition='KerasNetwork' + ) + + super().__init__( + device=device, l2_regularization=l2_regularization, name=name, inputs_spec=inputs_spec, + outputs=outputs + ) + + if isinstance(model, tf.keras.Model): + self.keras_model = model + elif (isinstance(model, type) and issubclass(model, tf.keras.Model)): + self.keras_model = model(**kwargs) + elif callable(model): + self.keras_model = model(**kwargs) + assert isinstance(self.keras_model, tf.keras.Model) + else: + raise TensorforceError.value(name='KerasNetwork', argument='model', value=model) + + # if self.keras_model.inputs is not None: + # assert False + + def get_architecture(self): + return 'KerasNetwork(model={})'.format(self.keras_model.__class__.__name__) + + def output_spec(self): + assert self.keras_model.compute_dtype in (tf.float32, tf.float64) + + if self.inputs_spec.is_singleton(): + input_shape = (None,) + self.inputs_spec.singleton().shape + else: + input_shape = [(None,) + spec.shape for spec in self.inputs_spec.values()] + + output_shape = self.keras_model.compute_output_shape(input_shape=input_shape) + assert isinstance(output_shape, tf.TensorShape) and output_shape.rank == 2 + output_shape = output_shape.as_list() + assert output_shape[0] is None + + return TensorSpec(type='float', shape=(output_shape[1],)) + + def initialize(self): + super().initialize() + + if self.inputs_spec.is_singleton(): + input_shape = (None,) + self.inputs_spec.singleton().shape + else: + input_shape = [(None,) + spec.shape for spec in self.inputs_spec.values()] + + self.keras_model.build(input_shape=input_shape) + + @tf_function(num_args=0) + def regularize(self): + regularization_loss = super().regularize() + + if len(self.keras_model.losses) > 0: + regularization_loss += tf.math.add_n(inputs=self.keras_model.losses) + + return regularization_loss + + @tf_function(num_args=4) + def apply(self, *, x, horizons, internals, deterministic, independent): + if x.is_singleton(): + inputs = x.singleton() + else: + inputs = list(x.values()) + + x = self.keras_model(inputs=inputs, training=(not independent)) + + return tf_util.cast(x=x, dtype='float'), internals diff --git a/tensorforce/core/networks/layer.py b/tensorforce/core/networks/layer.py deleted file mode 100755 index 2ef6d603a..000000000 --- a/tensorforce/core/networks/layer.py +++ /dev/null @@ -1,1162 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -Collection of custom layer implementations. 
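A hedged usage sketch for the `keras` network type added above (`build_model` is a hypothetical helper; agent and environment setup are assumptions; `KerasNetwork` accepts a `tf.keras.Model` instance, a `tf.keras.Model` subclass, or a callable returning one, with remaining kwargs forwarded, as in the constructor above). Whether a particular Keras model is usable depends on it supporting `compute_output_shape` for the state shape, which `output_spec` above relies on:

```python
# Usage sketch, not part of this diff: wrapping a Keras model via the 'keras' network
# type. Extra spec entries (here: units) are forwarded to the callable by KerasNetwork.
import tensorflow as tf
from tensorforce import Agent, Environment

def build_model(units=64):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(units, activation='relu'),
        tf.keras.layers.Dense(units, activation='relu')
    ])

environment = Environment.create(environment='gym', level='CartPole-v1')

agent = Agent.create(
    agent='ppo', environment=environment, batch_size=10,
    network=dict(type='keras', model=build_model, units=64)
)
```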
-""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from math import sqrt -import numpy as np -import tensorflow as tf - -from tensorforce import TensorForceError, util -import tensorforce.core.networks - - -class Layer(object): - """ - Base class for network layers. - """ - - def __init__(self, scope='layer', summary_labels=None): - """ - Layer. - """ - self.scope = scope - self.summary_labels = set(summary_labels or ()) - - self.named_tensors = dict() - self.variables = dict() - self.all_variables = dict() - self.summaries = list() - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - self.all_variables[name] = variable - if kwargs.get('trainable', True): - self.variables[name] = variable - if 'variables' in self.summary_labels: - summary = tf.summary.histogram(name=name, values=variable) - self.summaries.append(summary) - return variable - - self.apply = tf.make_template( - name_=(scope + '/apply'), - func_=self.tf_apply, - custom_getter_=custom_getter - ) - self.regularization_loss = tf.make_template( - name_=(scope + '/regularization-loss'), - func_=self.tf_regularization_loss, - custom_getter_=custom_getter - ) - - def tf_apply(self, x, update): - """ - Creates the TensorFlow operations for applying the layer to the given input. - - Args: - x: Layer input tensor. - update: Boolean tensor indicating whether this call happens during an update. - - Returns: - Layer output tensor. - """ - raise NotImplementedError - - def tf_regularization_loss(self): - """ - Creates the TensorFlow operations for the layer regularization loss. - - Returns: - Regularization loss tensor. - """ - return None - - def tf_tensors(self, named_tensors): - """ - Attaches the named_tensors dictionary to the layer for examination and update. - - Args: - named_tensors: Dictionary of named tensors to be used as Input's or recorded from outputs - - Returns: - NA - """ - self.named_tensors = named_tensors - - def internals_spec(self): - """ - Returns the internal states specification. - - Returns: - Internal states specification - """ - return dict() - - def get_variables(self, include_nontrainable=False): - """ - Returns the TensorFlow variables used by the layer. - - Returns: - List of variables. - """ - if include_nontrainable: - return [self.all_variables[key] for key in sorted(self.all_variables)] - else: - return [self.variables[key] for key in sorted(self.variables)] - - def get_summaries(self): - """ - Returns the TensorFlow summaries reported by the layer. - - Returns: - List of summaries. - """ - return self.summaries - - @staticmethod - def from_spec(spec, kwargs=None): - """ - Creates a layer from a specification dict. - """ - layer = util.get_object( - obj=spec, - predefined_objects=tensorforce.core.networks.layers, - kwargs=kwargs - ) - assert isinstance(layer, Layer) - return layer - - -class TFLayer(Layer): - """ - Wrapper class for TensorFlow layers. 
- """ - - tf_layers = dict( - average_pooling1d=tf.layers.AveragePooling1D, - average_pooling2d=tf.layers.AveragePooling2D, - average_pooling3d=tf.layers.AveragePooling3D, - batch_normalization=tf.layers.BatchNormalization, - conv1d=tf.layers.Conv1D, - conv2d=tf.layers.Conv2D, - conv2d_transpose=tf.layers.Conv2DTranspose, - conv3d=tf.layers.Conv3D, - conv3d_transpose=tf.layers.Conv3DTranspose, - dense=tf.layers.Dense, - dropout=tf.layers.Dropout, - flatten=tf.layers.Flatten, - max_pooling1d=tf.layers.MaxPooling1D, - max_pooling2d=tf.layers.MaxPooling2D, - max_pooling3d=tf.layers.MaxPooling3D, - separable_conv2d=tf.layers.SeparableConv2D - ) - - def __init__(self, layer, scope='tf-layer', summary_labels=(), **kwargs): - """ - Creates a new layer instance of a TensorFlow layer. - - Args: - name: The name of the layer, one of 'dense'. - **kwargs: Additional arguments passed on to the TensorFlow layer constructor. - """ - self.layer_spec = layer - self.layer = util.get_object(obj=layer, predefined_objects=TFLayer.tf_layers, kwargs=kwargs) - self.first_scope = None - - super(TFLayer, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - if self.first_scope is None: - # Store scope of first call since regularization losses will be registered there. - self.first_scope = tf.contrib.framework.get_name_scope() - return self.layer(inputs=x, training=update) - - def tf_regularization_loss(self): - regularization_losses = tf.get_collection( - key=tf.GraphKeys.REGULARIZATION_LOSSES, - scope=self.first_scope - ) - if len(regularization_losses) > 0: - return tf.add_n(inputs=regularization_losses) - else: - return None - - -class Nonlinearity(Layer): - """ - Non-linearity layer applying a non-linear transformation. - """ - - def __init__(self, - name='relu', - alpha=None, - beta=1.0, - max=None, - min=None, - scope='nonlinearity', - summary_labels=() - ): - """ - Non-linearity activation layer. - - Args: - name: Non-linearity name, one of 'elu', 'relu', 'selu', 'sigmoid', 'swish', 'softmax', - 'leaky_relu' (or 'lrelu'), 'crelu', 'softmax', 'softplus', 'softsign', 'tanh' or 'none'. 
- alpha: (float|int) Alpha value for leaky Relu - beta: (float|int|'learn') Beta value or 'learn' to train value (default 1.0) - max: (float|int) maximum (beta * input) value passed to non-linearity function - min: (float|int) minimum (beta * input) value passed to non-linearity function - summary_labels: Requested summary labels for tensorboard export, add 'beta' to watch beta learning - """ - self.name = name - self.alpha = None - self.max = None - self.min = None - self.beta_learn = False - super(Nonlinearity, self).__init__(scope=scope, summary_labels=summary_labels) - - if max is not None: - self.max = float(max) - - if min is not None: - self.min = float(min) - - if alpha is not None: - self.alpha = float(alpha) - - if beta == 'learn': - self.beta_learn = True - self.beta = None - else: - self.beta = tf.constant(float(beta), dtype=util.tf_dtype('float')) - - def tf_apply(self, x, update): - if self.beta_learn: - self.beta = tf.get_variable( - name='beta', - shape=(), - dtype=tf.float32, - initializer=tf.ones_initializer() - ) - - if self.max is not None: - x = tf.minimum(x=(self.beta * x), y=self.max) - - if self.min is not None: - x = tf.maximum(x=(self.beta * x), y=self.min) - - if self.name == 'elu': - x = tf.nn.elu(features=(self.beta * x)) - - elif self.name == 'none': - x = tf.identity(input=(self.beta * x)) - - elif self.name == 'relu': - x = tf.nn.relu(features=(self.beta * x)) - if 'relu' in self.summary_labels: - non_zero = tf.cast(x=tf.count_nonzero(input_tensor=x), dtype=tf.float32) - size = tf.cast(x=tf.reduce_prod(input_tensor=tf.shape(input=x)), dtype=tf.float32) - summary = tf.summary.scalar(name='relu', tensor=(non_zero / size)) - self.summaries.append(summary) - - elif self.name == 'selu': - # https://arxiv.org/pdf/1706.02515.pdf - x = tf.nn.selu(features=(self.beta * x)) - - elif self.name == 'sigmoid': - x = tf.sigmoid(x=(self.beta * x)) - - elif self.name == 'swish': - # https://arxiv.org/abs/1710.05941 - x = tf.sigmoid(x=(self.beta * x)) * x - - elif self.name == 'lrelu' or self.name == 'leaky_relu': - if self.alpha is None: - # Default alpha value for leaky_relu - self.alpha = 0.2 - x = tf.nn.leaky_relu(features=(self.beta * x), alpha=self.alpha) - - elif self.name == 'crelu': - x = tf.nn.crelu(features=(self.beta * x)) - - elif self.name == 'softmax': - x = tf.nn.softmax(logits=(self.beta * x)) - - elif self.name == 'softplus': - x = tf.nn.softplus(features=(self.beta * x)) - - elif self.name == 'softsign': - x = tf.nn.softsign(features=(self.beta * x)) - - elif self.name == 'tanh': - x = tf.nn.tanh(x=(self.beta * x)) - - else: - raise TensorForceError('Invalid non-linearity: {}'.format(self.name)) - - if 'beta' in self.summary_labels: - summary = tf.summary.scalar(name='beta', tensor=self.beta) - self.summaries.append(summary) - - return x - - -class Dropout(Layer): - """ - Dropout layer. If using dropout, add this layer after inputs and after dense layers. For - LSTM, dropout is handled independently as an argument. Not available for Conv2d yet. - """ - - def __init__(self, rate=0.0, scope='dropout', summary_labels=()): - self.rate = rate - super(Dropout, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - return tf.cond( - pred=update, - true_fn=(lambda: tf.nn.dropout(x=x, keep_prob=(1.0 - self.rate))), - false_fn=(lambda: tf.identity(input=x)) - ) - - -class Flatten(Layer): - """ - Flatten layer reshaping the input. 
- """ - - def __init__(self, scope='flatten', summary_labels=()): - super(Flatten, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - return tf.reshape(tensor=x, shape=(-1, util.prod(util.shape(x)[1:]))) - - -class Pool2d(Layer): - """ - 2-dimensional pooling layer. - """ - - def __init__( - self, - pooling_type='max', - window=2, - stride=2, - padding='SAME', - scope='pool2d', - summary_labels=() - ): - """ - 2-dimensional pooling layer. - - Args: - pooling_type: Either 'max' or 'average'. - window: Pooling window size, either an integer or pair of integers. - stride: Pooling stride, either an integer or pair of integers. - padding: Pooling padding, one of 'VALID' or 'SAME'. - """ - self.pooling_type = pooling_type - if isinstance(window, int): - self.window = (1, window, window, 1) - elif len(window) == 2: - self.window = (1, window[0], window[1], 1) - else: - raise TensorForceError('Invalid window {} for pool2d layer, must be of size 2'.format(window)) - if isinstance(stride, int): - self.stride = (1, stride, stride, 1) - elif len(window) == 2: - self.stride = (1, stride[0], stride[1], 1) - else: - raise TensorForceError('Invalid stride {} for pool2d layer, must be of size 2'.format(stride)) - self.padding = padding - super(Pool2d, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - if self.pooling_type == 'average': - x = tf.nn.avg_pool(value=x, ksize=self.window, strides=self.stride, padding=self.padding) - - elif self.pooling_type == 'max': - x = tf.nn.max_pool(value=x, ksize=self.window, strides=self.stride, padding=self.padding) - - else: - raise TensorForceError('Invalid pooling type: {}'.format(self.name)) - - return x - - -class Embedding(Layer): - """ - Embedding layer. - """ - - def __init__( - self, - indices, - size, - l2_regularization=0.0, - l1_regularization=0.0, - scope='embedding', - summary_labels=() - ): - """ - Embedding layer. - - Args: - indices: Number of embedding indices. - size: Embedding size. - l2_regularization: L2 regularization weight. - l1_regularization: L1 regularization weight. - """ - self.indices = indices - self.size = size - self.l2_regularization = l2_regularization - self.l1_regularization = l1_regularization - super(Embedding, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - stddev = min(0.1, sqrt(1.0 / self.size)) - weights_init = tf.random_normal_initializer(mean=0.0, stddev=stddev, dtype=tf.float32) - self.weights = tf.get_variable( - name='embeddings', - shape=(self.indices, self.size), - dtype=tf.float32, - initializer=weights_init - ) - return tf.nn.embedding_lookup(params=self.weights, ids=x) - - def tf_regularization_loss(self): - regularization_loss = super(Embedding, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - if self.l2_regularization > 0.0: - losses.append(self.l2_regularization * tf.nn.l2_loss(t=self.weights)) - - if self.l1_regularization > 0.0: - losses.append(self.l1_regularization * tf.reduce_sum(input_tensor=tf.abs(x=self.weights))) - - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None - - -class Linear(Layer): - """ - Linear fully-connected layer. - """ - - def __init__( - self, - size, - weights=None, - bias=True, - l2_regularization=0.0, - l1_regularization=0.0, - scope='linear', - summary_labels=() - ): - """ - Linear layer. - - Args: - size: Layer size. 
- weights: Weight initialization, random if None. - bias: Bias initialization, random if True, no bias added if False. - l2_regularization: L2 regularization weight. - l1_regularization: L1 regularization weight. - """ - self.size = size - self.weights_init = weights - self.bias_init = bias - self.l2_regularization = l2_regularization - self.l1_regularization = l1_regularization - super(Linear, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update=False): - if util.rank(x) != 2: - raise TensorForceError( - 'Invalid input rank for linear layer: {}, must be 2.'.format(util.rank(x)) - ) - - if self.size is None: # If size is None than Output Matches Input, required for Skip Connections - self.size = x.shape[1].value - - weights_shape = (x.shape[1].value, self.size) - - if self.weights_init is None: - stddev = min(0.1, sqrt(2.0 / (x.shape[1].value + self.size))) - self.weights_init = tf.random_normal_initializer(mean=0.0, stddev=stddev, dtype=tf.float32) - - elif isinstance(self.weights_init, dict): - if 'name' in self.weights_init: - if self.weights_init['name'] == 'msra': - slope = 0.25 - if 'slope' in self.weights_init: - slope = self.weights_init['slope'] - magnitude = 2.0 / (1.0 + slope ** 2) - stddev = sqrt(magnitude * 2.0 / (x.shape[1].value + self.size)) - self.weights_init = tf.random_normal_initializer(mean=0.0, stddev=stddev, dtype=tf.float32) - else: - raise TensorForceError( - 'Linear weights init with dict does not has name attribute, weight_init={}'.format(self.weights_init) - ) - - elif isinstance(self.weights_init, float): - if self.weights_init == 0.0: - self.weights_init = tf.zeros_initializer(dtype=tf.float32) - else: - self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32) - - elif isinstance(self.weights_init, list): - self.weights_init = np.asarray(self.weights_init, dtype=np.float32) - if self.weights_init.shape != weights_shape: - raise TensorForceError( - 'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape) - ) - self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32) - - elif isinstance(self.weights_init, np.ndarray): - if self.weights_init.shape != weights_shape: - raise TensorForceError( - 'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape) - ) - self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32) - - elif isinstance(self.weights_init, tf.Tensor): - if util.shape(self.weights_init) != weights_shape: - raise TensorForceError( - 'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape) - ) - - bias_shape = (self.size,) - - if isinstance(self.bias_init, bool): - if self.bias_init: - self.bias_init = tf.zeros_initializer(dtype=tf.float32) - else: - self.bias_init = None - - elif isinstance(self.bias_init, float): - if self.bias_init == 0.0: - self.bias_init = tf.zeros_initializer(dtype=tf.float32) - else: - self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32) - - elif isinstance(self.bias_init, list): - self.bias_init = np.asarray(self.bias_init, dtype=np.float32) - if self.bias_init.shape != bias_shape: - raise TensorForceError( - 'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape) - ) - self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32) - - elif isinstance(self.bias_init, np.ndarray): - if 
self.bias_init.shape != bias_shape: - raise TensorForceError( - 'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape) - ) - self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32) - - elif isinstance(self.bias_init, tf.Tensor): - if util.shape(self.bias_init) != bias_shape: - raise TensorForceError( - 'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape) - ) - - if isinstance(self.weights_init, tf.Tensor): - self.weights = self.weights_init - else: - self.weights = tf.get_variable( - name='W', - shape=weights_shape, - dtype=tf.float32, - initializer=self.weights_init - ) - - x = tf.matmul(a=x, b=self.weights) - - if self.bias_init is None: - self.bias = None - - else: - if isinstance(self.bias_init, tf.Tensor): - self.bias = self.bias_init - else: - self.bias = tf.get_variable(name='b', shape=bias_shape, dtype=tf.float32, initializer=self.bias_init) - - x = tf.nn.bias_add(value=x, bias=self.bias) - - return x - - def tf_regularization_loss(self): - regularization_loss = super(Linear, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - if self.l2_regularization > 0.0: - losses.append(self.l2_regularization * tf.nn.l2_loss(t=self.weights)) - if self.bias is not None: - losses.append(self.l2_regularization * tf.nn.l2_loss(t=self.bias)) - - if self.l1_regularization > 0.0: - losses.append(self.l1_regularization * tf.reduce_sum(input_tensor=tf.abs(x=self.weights))) - if self.bias is not None: - losses.append(self.l1_regularization * tf.reduce_sum(input_tensor=tf.abs(x=self.bias))) - - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None - - -class Dense(Layer): - """ - Dense layer, i.e. linear fully connected layer with subsequent non-linearity. - """ - - def __init__( - self, - size=None, - weights=None, - bias=True, - activation='relu', - l2_regularization=0.0, - l1_regularization=0.0, - skip=False, - scope='dense', - summary_labels=() - ): - """ - Dense layer. - - Args: - size: Layer size, if None than input size matches the output size of the layer - weights: Weight initialization, random if None. - bias: If true, bias is added. - activation: Type of nonlinearity, or dict with name & arguments - l2_regularization: L2 regularization weight. - l1_regularization: L1 regularization weight. 
- skip: Add skip connection like ResNet (https://arxiv.org/pdf/1512.03385.pdf), - doubles layers and ShortCut from Input to output - """ - self.skip = skip - if self.skip and size is not None: - raise TensorForceError( - 'Dense Layer SKIP connection needs Size=None, uses input shape ' - 'sizes to create skip connection network, please delete "size" parameter' - ) - - self.linear = Linear( - size=size, - weights=weights, - bias=bias, - l2_regularization=l2_regularization, - l1_regularization=l1_regularization, - summary_labels=summary_labels - ) - if self.skip: - self.linear_skip = Linear( - size=size, - bias=bias, - l2_regularization=l2_regularization, - l1_regularization=l1_regularization, - summary_labels=summary_labels - ) - # TODO: Consider creating two nonlinearity variables when skip is used and learning beta - # Right now, only a single beta can be learned - self.nonlinearity = Nonlinearity(summary_labels=summary_labels, **util.prepare_kwargs(activation)) - super(Dense, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - xl1 = self.linear.apply(x=x, update=update) - xl1 = self.nonlinearity.apply(x=xl1, update=update) - if self.skip: - xl2 = self.linear_skip.apply(x=xl1, update=update) - xl2 = self.nonlinearity.apply(x=(xl2 + x), update=update) #add input back in as skip connection per paper - else: - xl2 = xl1 - - if 'activations' in self.summary_labels: - summary = tf.summary.histogram(name='activations', values=xl2) - self.summaries.append(summary) - - return xl2 - - def tf_regularization_loss(self): - regularization_loss = super(Dense, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - regularization_loss = self.linear.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - regularization_loss = self.nonlinearity.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - if self.skip: - regularization_loss = self.linear_skip.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None - - def get_variables(self, include_nontrainable=False): - layer_variables = super(Dense, self).get_variables(include_nontrainable=include_nontrainable) - linear_variables = self.linear.get_variables(include_nontrainable=include_nontrainable) - if self.skip: - linear_variables = linear_variables \ - + self.linear_skip.get_variables(include_nontrainable=include_nontrainable) - nonlinearity_variables = self.nonlinearity.get_variables(include_nontrainable=include_nontrainable) - - return layer_variables + linear_variables + nonlinearity_variables - - def get_summaries(self): - layer_summaries = super(Dense, self).get_summaries() - linear_summaries = self.linear.get_summaries() - nonlinearity_summaries = self.nonlinearity.get_summaries() - - return layer_summaries + linear_summaries + nonlinearity_summaries - - -class Dueling(Layer): - """ - Dueling layer, i.e. Duel pipelines for Exp & Adv to help with stability - """ - - def __init__( - self, - size, - bias=False, - activation='none', - l2_regularization=0.0, - l1_regularization=0.0, - output=None, - scope='dueling', - summary_labels=() - ): - """ - Dueling layer. - - [Dueling Networks] (https://arxiv.org/pdf/1511.06581.pdf) - Implement Y = Expectation[x] + (Advantage[x] - Mean(Advantage[x])) - - Args: - size: Layer size. 
- bias: If true, bias is added. - activation: Type of nonlinearity, or dict with name & arguments - l2_regularization: L2 regularization weight. - l1_regularization: L1 regularization weight. - output: None or tuple of output names for ('expectation','advantage','mean_advantage') - """ - # Expectation is broadcast back over advantage values so output is of size 1 - self.expectation_layer = Linear( - size=1, bias=bias, - l2_regularization=l2_regularization, - l1_regularization=l1_regularization, - summary_labels=summary_labels - ) - self.advantage_layer = Linear( - size=size, - bias=bias, - l2_regularization=l2_regularization, - l1_regularization=l1_regularization, - summary_labels=summary_labels - ) - self.output = output - self.nonlinearity = Nonlinearity(summary_labels=summary_labels, **util.prepare_kwargs(activation)) - super(Dueling, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - expectation = self.expectation_layer.apply(x=x, update=update) - advantage = self.advantage_layer.apply(x=x, update=update) - mean_advantage = tf.reduce_mean(input_tensor=advantage, axis=1, keep_dims=True) - - # Record outputs in named tensor dictionary if passed - if type(self.output) is tuple and len(self.output) == 3: - self.named_tensors[self.output[0]] = expectation - self.named_tensors[self.output[1]] = advantage - mean_advantage - self.named_tensors[self.output[2]] = mean_advantage - if 'activations' in self.summary_labels: - summary = tf.summary.histogram(name=self.output[0], values=expectation) - self.summaries.append(summary) - summary = tf.summary.histogram(name=self.output[1], values=advantage - mean_advantage) - self.summaries.append(summary) - summary = tf.summary.histogram(name=self.output[2], values=mean_advantage) - self.summaries.append(summary) - - x = expectation + advantage - mean_advantage - - x = self.nonlinearity.apply(x=x, update=update) - - if 'activations' in self.summary_labels: - summary = tf.summary.histogram(name='activations', values=x) - self.summaries.append(summary) - - return x - - def tf_regularization_loss(self): - regularization_loss = super(Dueling, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - regularization_loss = self.expectation_layer.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - regularization_loss = self.advantage_layer.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None - - def get_variables(self, include_nontrainable=False): - layer_variables = super(Dueling, self).get_variables(include_nontrainable=include_nontrainable) - expectation_layer_variables = self.expectation_layer.get_variables(include_nontrainable=include_nontrainable) - advantage_layer_variables = self.advantage_layer.get_variables(include_nontrainable=include_nontrainable) - nonlinearity_variables = self.nonlinearity.get_variables(include_nontrainable=include_nontrainable) - - return layer_variables + expectation_layer_variables + advantage_layer_variables + nonlinearity_variables - - def get_summaries(self): - layer_summaries = super(Dueling, self).get_summaries() - expectation_layer_summaries = self.expectation_layer.get_summaries() - advantage_layer_summaries = self.advantage_layer.get_summaries() - nonlinearity_summaries = self.nonlinearity.get_summaries() - - return layer_summaries + 
expectation_layer_summaries + advantage_layer_summaries + nonlinearity_summaries - - -class Conv1d(Layer): - """ - 1-dimensional convolutional layer. - """ - - def __init__( - self, - size, - window=3, - stride=1, - padding='SAME', - bias=True, - activation='relu', - l2_regularization=0.0, - l1_regularization=0.0, - scope='conv1d', - summary_labels=() - ): - """ - 1D convolutional layer. - - Args: - size: Number of filters - window: Convolution window size - stride: Convolution stride - padding: Convolution padding, one of 'VALID' or 'SAME' - bias: If true, a bias is added - activation: Type of nonlinearity, or dict with name & arguments - l2_regularization: L2 regularization weight - l1_regularization: L1 regularization weight - """ - self.size = size - self.window = window - self.stride = stride - self.padding = padding - self.bias = bias - self.l2_regularization = l2_regularization - self.l1_regularization = l1_regularization - self.nonlinearity = Nonlinearity(summary_labels=summary_labels, **util.prepare_kwargs(activation)) - super(Conv1d, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - if util.rank(x) != 3: - raise TensorForceError('Invalid input rank for conv1d layer: {}, must be 3'.format(util.rank(x))) - - filters_shape = (self.window, x.shape[2].value, self.size) - stddev = min(0.1, sqrt(2.0 / self.size)) - filters_init = tf.random_normal_initializer(mean=0.0, stddev=stddev, dtype=tf.float32) - self.filters = tf.get_variable(name='W', shape=filters_shape, dtype=tf.float32, initializer=filters_init) - x = tf.nn.conv1d(value=x, filters=self.filters, stride=self.stride, padding=self.padding) - - if self.bias: - bias_shape = (self.size,) - bias_init = tf.zeros_initializer(dtype=tf.float32) - self.bias = tf.get_variable(name='b', shape=bias_shape, dtype=tf.float32, initializer=bias_init) - x = tf.nn.bias_add(value=x, bias=self.bias) - - x = self.nonlinearity.apply(x=x, update=update) - - if 'activations' in self.summary_labels: - summary = tf.summary.histogram(name='activations', values=x) - self.summaries.append(summary) - - return x - - def tf_regularization_loss(self): - regularization_loss = super(Conv1d, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - if self.l2_regularization > 0.0: - losses.append(self.l2_regularization * tf.nn.l2_loss(t=self.filters)) - if self.bias is not None: - losses.append(self.l2_regularization * tf.nn.l2_loss(t=self.bias)) - - if self.l1_regularization > 0.0: - losses.append(self.l1_regularization * tf.reduce_sum(input_tensor=tf.abs(x=self.filters))) - if self.bias is not None: - losses.append(self.l1_regularization * tf.reduce_sum(input_tensor=tf.abs(x=self.bias))) - - regularization_loss = self.nonlinearity.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None - - def get_variables(self, include_nontrainable=False): - layer_variables = super(Conv1d, self).get_variables(include_nontrainable=include_nontrainable) - nonlinearity_variables = self.nonlinearity.get_variables(include_nontrainable=include_nontrainable) - - return layer_variables + nonlinearity_variables - - def get_summaries(self): - layer_summaries = super(Conv1d, self).get_summaries() - nonlinearity_summaries = self.nonlinearity.get_summaries() - - return layer_summaries + nonlinearity_summaries - - -class Conv2d(Layer): - """ - 2-dimensional 
convolutional layer. - """ - - def __init__( - self, - size, - window=3, - stride=1, - padding='SAME', - bias=True, - activation='relu', - l2_regularization=0.0, - l1_regularization=0.0, - scope='conv2d', - summary_labels=() - ): - """ - 2D convolutional layer. - - Args: - size: Number of filters - window: Convolution window size, either an integer or pair of integers. - stride: Convolution stride, either an integer or pair of integers. - padding: Convolution padding, one of 'VALID' or 'SAME' - bias: If true, a bias is added - activation: Type of nonlinearity, or dict with name & arguments - l2_regularization: L2 regularization weight - l1_regularization: L1 regularization weight - """ - self.size = size - if isinstance(window, int): - self.window = (window, window) - elif len(window) == 2: - self.window = tuple(window) - else: - raise TensorForceError('Invalid window {} for conv2d layer, must be of size 2'.format(window)) - self.stride = stride - self.padding = padding - self.bias = bias - self.l2_regularization = l2_regularization - self.l1_regularization = l1_regularization - self.nonlinearity = Nonlinearity(summary_labels=summary_labels, **util.prepare_kwargs(activation)) - super(Conv2d, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update): - if util.rank(x) != 4: - raise TensorForceError('Invalid input rank for conv2d layer: {}, must be 4'.format(util.rank(x))) - - filters_shape = self.window + (x.shape[3].value, self.size) - stddev = min(0.1, sqrt(2.0 / self.size)) - filters_init = tf.random_normal_initializer(mean=0.0, stddev=stddev, dtype=tf.float32) - self.filters = tf.get_variable(name='W', shape=filters_shape, dtype=tf.float32, initializer=filters_init) - stride_h, stride_w = self.stride if type(self.stride) is tuple else (self.stride, self.stride) - x = tf.nn.conv2d(input=x, filter=self.filters, strides=(1, stride_h, stride_w, 1), padding=self.padding) - - if self.bias: - bias_shape = (self.size,) - bias_init = tf.zeros_initializer(dtype=tf.float32) - self.bias = tf.get_variable(name='b', shape=bias_shape, dtype=tf.float32, initializer=bias_init) - x = tf.nn.bias_add(value=x, bias=self.bias) - - x = self.nonlinearity.apply(x=x, update=update) - - if 'activations' in self.summary_labels: - summary = tf.summary.histogram(name='activations', values=x) - self.summaries.append(summary) - - return x - - def tf_regularization_loss(self): - regularization_loss = super(Conv2d, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() - else: - losses = [regularization_loss] - - if self.l2_regularization > 0.0: - losses.append(self.l2_regularization * tf.nn.l2_loss(t=self.filters)) - if self.bias is not None: - losses.append(self.l2_regularization * tf.nn.l2_loss(t=self.bias)) - - if self.l1_regularization > 0.0: - losses.append(self.l1_regularization * tf.reduce_sum(input_tensor=tf.abs(x=self.filters))) - if self.bias is not None: - losses.append(self.l1_regularization * tf.reduce_sum(input_tensor=tf.abs(x=self.bias))) - - regularization_loss = self.nonlinearity.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) - - if len(losses) > 0: - return tf.add_n(inputs=losses) - else: - return None - - def get_variables(self, include_nontrainable=False): - layer_variables = super(Conv2d, self).get_variables(include_nontrainable=include_nontrainable) - nonlinearity_variables = self.nonlinearity.get_variables(include_nontrainable=include_nontrainable) - - return layer_variables + 
nonlinearity_variables - - def get_summaries(self): - layer_summaries = super(Conv2d, self).get_summaries() - nonlinearity_summaries = self.nonlinearity.get_summaries() - - return layer_summaries + nonlinearity_summaries - - -class InternalLstm(Layer): - """ - Long short-term memory layer for internal state management. - """ - - def __init__(self, size, dropout=None, scope='internal_lstm', summary_labels=()): - """ - LSTM layer. - - Args: - size: LSTM size. - dropout: Dropout rate. - """ - self.size = size - self.dropout = dropout - super(InternalLstm, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update, state): - if util.rank(x) != 2: - raise TensorForceError( - 'Invalid input rank for internal lstm layer: {}, must be 2.'.format(util.rank(x)) - ) - - state = tf.contrib.rnn.LSTMStateTuple(c=state[:, 0, :], h=state[:, 1, :]) - - self.lstm_cell = tf.contrib.rnn.LSTMCell(num_units=self.size) - - if self.dropout is not None: - keep_prob = tf.cond(pred=update, true_fn=(lambda: 1.0 - self.dropout), false_fn=(lambda: 1.0)) - self.lstm_cell = tf.contrib.rnn.DropoutWrapper(cell=self.lstm_cell, output_keep_prob=keep_prob) - - x, state = self.lstm_cell(inputs=x, state=state) - - state = tf.stack(values=(state.c, state.h), axis=1) - - if 'activations' in self.summary_labels: - summary = tf.summary.histogram(name='activations', values=x) - self.summaries.append(summary) - - return x, dict(state=state) - - def internals_spec(self): - return dict(state=dict( - type='float', - shape=(2, self.size), - initialization='zeros' - )) - - -class Lstm(Layer): - - def __init__(self, size, dropout=None, scope='lstm', summary_labels=(), return_final_state=True): - """ - LSTM layer. - - Args: - size: LSTM size. - dropout: Dropout rate. - """ - self.size = size - self.dropout = dropout - self.return_final_state = return_final_state - super(Lstm, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update, sequence_length=None): - if util.rank(x) != 3: - raise TensorForceError('Invalid input rank for lstm layer: {}, must be 3.'.format(util.rank(x))) - - lstm_cell = tf.contrib.rnn.LSTMCell(num_units=self.size) - if 'activations' in self.summary_labels: - summary = tf.summary.histogram(name='activations', values=x) - self.summaries.append(summary) - - x, state = tf.nn.dynamic_rnn( - cell=lstm_cell, - inputs=x, - sequence_length=sequence_length, - dtype=tf.float32 - ) - - # This distinction is so we can stack multiple LSTM layers - if self.return_final_state: - return tf.concat(values=(state.c, state.h), axis=1) - else: - return x diff --git a/tensorforce/core/networks/network.py b/tensorforce/core/networks/network.py index 02c22c539..34f26f022 100755 --- a/tensorforce/core/networks/network.py +++ b/tensorforce/core/networks/network.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,298 +13,425 @@ # limitations under the License. 
# ============================================================================== -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - from collections import Counter -import json -import os import tensorflow as tf -from tensorforce import util, TensorForceError -from tensorforce.core.networks import Layer +from tensorforce import TensorforceError +from tensorforce.core import ArrayDict, Module, SignatureDict, TensorDict, TensorSpec, \ + TensorsSpec, tf_function, tf_util +from tensorforce.core.layers import Block, Layer, layer_modules, MultiInputLayer, \ + NondeterministicLayer, PreprocessingLayer, Register, Reuse, StatefulLayer, TemporalLayer +from tensorforce.core.parameters import Parameter -class Network(object): +class Network(Module): """ Base class for neural networks. - """ - def __init__(self, scope='network', summary_labels=None): - """ - Neural network. - """ - self.summary_labels = set(summary_labels or ()) - - self.variables = dict() - self.all_variables = dict() - self.summaries = list() - self.named_tensors = dict() - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - self.all_variables[name] = variable - if kwargs.get('trainable', True): - self.variables[name] = variable - if 'variables' in self.summary_labels: - summary = tf.summary.histogram(name=name, values=variable) - self.summaries.append(summary) - return variable - - self.apply = tf.make_template( - name_=(scope + '/apply'), - func_=self.tf_apply, - custom_getter_=custom_getter - ) - self.regularization_loss = tf.make_template( - name_=(scope + '/regularization-loss'), - func_=self.tf_regularization_loss, - custom_getter_=custom_getter - ) + Args: + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + inputs_spec (specification): internal use. + outputs (iter[string]): internal use. + """ - def tf_apply(self, x, internals, update, return_internals=False): - """ - Creates the TensorFlow operations for applying the network to the given input. + def __init__( + self, *, device=None, l2_regularization=None, name=None, inputs_spec=None, outputs=None + ): + super().__init__(name=name, device=device, l2_regularization=l2_regularization) - Args: - x: Network input tensor or dict of input tensors. - internals: List of prior internal state tensors - update: Boolean tensor indicating whether this call happens during an update. - return_internals: If true, also returns posterior internal state tensors + self.inputs_spec = inputs_spec - Returns: - Network output tensor, plus optionally list of posterior internal state tensors - """ - raise NotImplementedError + if outputs is None: + self.outputs = outputs + else: + self.outputs = tuple(outputs) + if any(not isinstance(output, str) for output in self.outputs): + raise TensorforceError.value( + name='LayerbasedNetwork', argument='outputs', value=self.outputs + ) - def tf_regularization_loss(self): - """ - Creates the TensorFlow operations for the network regularization loss. + def get_architecture(self): + return self.__class__.__name__ - Returns: - Regularization loss tensor - """ - return None + def output_spec(self): + raise NotImplementedError + @property def internals_spec(self): - """ - Returns the internal states specification. 
- - Returns: - Internal states specification - """ - return dict() - - def get_variables(self, include_nontrainable=False): - """ - Returns the TensorFlow variables used by the network. - - Returns: - List of variables - """ - if include_nontrainable: - return [self.all_variables[key] for key in sorted(self.all_variables)] - else: - return [self.variables[key] for key in sorted(self.variables)] - - def get_summaries(self): - """ - Returns the TensorFlow summaries reported by the network. - - Returns: - List of summaries - """ - return self.summaries - - def get_named_tensor(self, name): - """ - Returns a named tensor if available. - - Returns: - valid: True if named tensor found, False otherwise - tensor: If valid, will be a tensor, otherwise None - """ - if name in self.named_tensors: - return True, self.named_tensors[name] - else: - return False, None + return TensorsSpec() - def get_list_of_named_tensor(self): - """ - Returns a list of the names of tensors available. + def internals_init(self): + return ArrayDict() - Returns: - List of the names of tensors available. - """ - return list(self.named_tensors) + def max_past_horizon(self, *, on_policy): + return 0 - def set_named_tensor(self, name, tensor): - """ - Returns the TensorFlow summaries reported by the network. + def input_signature(self, *, function): + if function == 'apply': + return SignatureDict( + x=self.inputs_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + deterministic=TensorSpec(type='bool', shape=()).signature(batched=False) + ) - Returns: - None - """ - self.named_tensors[name] = tensor + elif function == 'past_horizon': + return SignatureDict() - @staticmethod - def from_spec(spec, kwargs=None): - """ - Creates a network from a specification dict. - """ - if isinstance(spec, list) and len(spec) > 0: - # Default case is a list of dict() with each dict describing a layer. - if type(spec[0]) is dict: - network = util.get_object( - obj=spec, - default_object=LayeredNetwork, - kwargs=kwargs - ) - # ComplexLayeredNetwork forced for testing - if type(spec[0]) is list: - # Spec contains List of List of Dict(), Complex network specification - # Load "ComplexLayeredNetwork" here to avoid a recurring loop which fails - from tensorforce.core.networks.complex_network import ComplexLayeredNetwork - network = util.get_object( - obj=spec, - default_object=ComplexLayeredNetwork, - kwargs=kwargs - ) else: - network = util.get_object( - obj=spec, - default_object=LayeredNetwork, - kwargs=kwargs + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'apply': + return SignatureDict( + x=self.output_spec().signature(batched=True), + internals=self.internals_spec.signature(batched=True) + ) + + elif function == 'past_horizon': + return SignatureDict( + singleton=TensorSpec(type='int', shape=()).signature(batched=False) ) - # If neither format, invalid spec and will fail on assert - assert isinstance(network, Network) - return network + else: + return super().output_signature(function=function) + + @tf_function(num_args=0) + def past_horizon(self, *, on_policy): + return tf_util.constant(value=0, dtype='int') -class LayerBasedNetwork(Network): + @tf_function(num_args=4) + def apply(self, *, x, horizons, internals, deterministic, independent): + raise NotImplementedError + + +class LayerbasedNetwork(Network): """ - Base class for networks using TensorForce layers. 
+ Base class for networks using Tensorforce layers. """ - def __init__(self, scope='layerbased-network', summary_labels=()): - """ - Layer-based network. - """ - super(LayerBasedNetwork, self).__init__(scope=scope, summary_labels=summary_labels) - self.layers = list() - - def add_layer(self, layer): - self.layers.append(layer) + def __init__(self, *, name, inputs_spec, device=None, l2_regularization=None, outputs=None): + super().__init__( + name=name, inputs_spec=inputs_spec, device=device, l2_regularization=l2_regularization + ) - def tf_regularization_loss(self): - regularization_loss = super(LayerBasedNetwork, self).tf_regularization_loss() - if regularization_loss is None: - losses = list() + if self.inputs_spec.is_singleton(): + self.registered_tensors_spec = TensorsSpec(state=self.inputs_spec.singleton()) else: - losses = [regularization_loss] + self.registered_tensors_spec = self.inputs_spec.copy() - for layer in self.layers: - regularization_loss = layer.regularization_loss() - if regularization_loss is not None: - losses.append(regularization_loss) + self._output_spec = self.inputs_spec.value() - if len(losses) > 0: - return tf.add_n(inputs=losses) + def invalid_layer_types(self): + return (PreprocessingLayer,) + + def output_spec(self): + if self.outputs is None: + return self._output_spec else: - return None + self.outputs = tuple( + output for output in self.outputs if output in self.registered_tensors_spec + ) + output_spec = TensorsSpec(embedding=self._output_spec) + output_spec.update(self.registered_tensors_spec[self.outputs]) + return output_spec + @staticmethod + def _recursive_temporal_layers(*, layer, fn): + if isinstance(layer, TemporalLayer): + fn(layer) + elif isinstance(layer, Block): + for block_layer in layer.this_submodules: + LayerbasedNetwork._recursive_temporal_layers(layer=block_layer, fn=fn) + elif isinstance(layer, Reuse): + LayerbasedNetwork._recursive_temporal_layers(layer=layer.reused_layer, fn=fn) + + @property def internals_spec(self): - internals_spec = dict() - for layer in self.layers: - for name, internal_spec in layer.internals_spec().items(): - internals_spec['{}_{}'.format(layer.scope, name)] = internal_spec + internals_spec = super().internals_spec + + def fn(layer): + internals_spec[layer.name] = layer.internals_spec + + for layer in self.this_submodules: + LayerbasedNetwork._recursive_temporal_layers(layer=layer, fn=fn) + return internals_spec - def get_variables(self, include_nontrainable=False): - network_variables = super(LayerBasedNetwork, self).get_variables( - include_nontrainable=include_nontrainable + def internals_init(self): + internals_init = super().internals_init() + + def fn(layer): + internals_init[layer.name] = layer.internals_init() + + for layer in self.this_submodules: + LayerbasedNetwork._recursive_temporal_layers(layer=layer, fn=fn) + + return internals_init + + def max_past_horizon(self, *, on_policy): + past_horizons = [super().max_past_horizon(on_policy=on_policy)] + + def fn(layer): + past_horizons.append(layer.max_past_horizon(on_policy=on_policy)) + + for layer in self.this_submodules: + LayerbasedNetwork._recursive_temporal_layers(layer=layer, fn=fn) + + return max(past_horizons) + + @tf_function(num_args=0) + def past_horizon(self, *, on_policy): + past_horizons = [super().past_horizon(on_policy=on_policy)] + + def fn(layer): + past_horizons.append(layer.past_horizon(on_policy=on_policy)) + + for layer in self.this_submodules: + LayerbasedNetwork._recursive_temporal_layers(layer=layer, fn=fn) + + return 
tf.math.reduce_max(input_tensor=tf.stack(values=past_horizons, axis=0), axis=0) + + def submodule( + self, *, name, module=None, modules=None, default_module=None, is_trainable=True, + is_saved=True, **kwargs + ): + # Module class and args + if modules is None: + modules = layer_modules + module_cls, args, kwargs = Module.get_module_class_and_args( + name=name, module=module, modules=modules, default_module=default_module, **kwargs + ) + if len(args) > 0: + assert len(kwargs) == 0 + module_cls = args[0] + + # Default input_spec + if not issubclass(module_cls, Layer): + pass + + elif kwargs.get('input_spec') is None: + if issubclass(module_cls, MultiInputLayer): + if 'tensors' not in kwargs: + raise TensorforceError.required(name='MultiInputLayer', argument='tensors') + tensors = kwargs['tensors'] + if isinstance(tensors, str): + tensors = (tensors,) + else: + tensors = tuple(tensors) + if tensors not in self.registered_tensors_spec: + raise TensorforceError.exists_not( + name='registered tensor', value=kwargs['tensors'] + ) + kwargs['input_spec'] = self.registered_tensors_spec[tensors] + + elif self._output_spec is None: + raise TensorforceError.required( + name='layer-based network', argument='first layer', expected='retrieve', + condition='multiple state/input components' + ) + + else: + kwargs['input_spec'] = self._output_spec + + elif issubclass(module_cls, MultiInputLayer): + raise TensorforceError.invalid(name='MultiInputLayer', argument='input_spec') + + layer = super().submodule( + module=module_cls, modules=modules, default_module=default_module, + is_trainable=is_trainable, is_saved=is_saved, **kwargs ) - layer_variables = [ - variable for layer in self.layers - for variable in layer.get_variables(include_nontrainable=include_nontrainable) - ] - return network_variables + layer_variables + if not isinstance(layer, (Layer, Parameter)): + raise TensorforceError.type( + name='layer-based network', argument='sub-module', value=layer + ) + + elif isinstance(layer, self.invalid_layer_types()): + raise TensorforceError.type( + name='network', argument='layer', value=layer, hint='invalid layer type' + ) + + if isinstance(layer, Layer): + self._output_spec = layer.output_spec() - def get_summaries(self): - network_summaries = super(LayerBasedNetwork, self).get_summaries() - layer_summaries = [summary for layer in self.layers for summary in layer.get_summaries()] + if isinstance(layer, Register): + if layer.tensor in self.registered_tensors_spec: + raise TensorforceError.exists(name='registered tensor', value=layer.tensor) + self.registered_tensors_spec[layer.tensor] = layer.output_spec() - return network_summaries + layer_summaries + return layer -class LayeredNetwork(LayerBasedNetwork): +class LayeredNetwork(LayerbasedNetwork): """ - Network consisting of a sequence of layers, which can be created from a specification dict. + Network consisting of Tensorforce layers (specification key: `custom` or `layered`), which can + be specified as either a list of layer specifications in the case of a standard sequential + layer-stack architecture, or as a list of list of layer specifications in the case of a more + complex architecture consisting of multiple sequential layer-stacks. 
Note that the final + action/value layer of the policy/baseline network is implicitly added, so the network output can + be of arbitrary size and use any activation function, and is only required to be a rank-one + embedding vector, or optionally have the same shape as the action in the case of a higher-rank + action shape. + + Args: + layers (iter[specification] | iter[iter[specification]]): Layers configuration, see the + [layers documentation](../modules/layers.html) + (required). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + inputs_spec (specification): internal use. + outputs (iter[string]): internal use. """ - def __init__(self, layers, scope='layered-network', summary_labels=()): - """ - Single-stack layered network. + # (requires layers as first argument) + def __init__( + self, layers, *, device=None, l2_regularization=None, name=None, inputs_spec=None, + outputs=None + ): + super().__init__( + device=device, l2_regularization=l2_regularization, name=name, inputs_spec=inputs_spec, + outputs=outputs + ) - Args: - layers: List of layer specification dicts. - """ - self.layers_spec = layers - super(LayeredNetwork, self).__init__(scope=scope, summary_labels=summary_labels) + self.layers = self._parse_layers_spec(spec=layers, counter=Counter()) - layer_counter = Counter() - for layer_spec in self.layers_spec: - if isinstance(layer_spec['type'], str): - name = layer_spec['type'] - else: - name = 'layer' - scope = name + str(layer_counter[name]) - layer_counter[name] += 1 + def get_architecture(self): + architecture = LayeredNetwork._recursive_get_architecture(layer=self.layers) + while '----\n----' in architecture: + architecture = architecture.replace('----\n----', '----') + if architecture.startswith('----'): + architecture = architecture[4:] + return architecture - layer = Layer.from_spec( - spec=layer_spec, - kwargs=dict(scope=scope, summary_labels=summary_labels) + @staticmethod + def _recursive_get_architecture(*, layer): + if isinstance(layer, list): + return '----\n' + '\n'.join( + LayeredNetwork._recursive_get_architecture(layer=layer) for layer in layer ) - self.add_layer(layer=layer) - def tf_apply(self, x, internals, update, return_internals=False): - if isinstance(x, dict): - if len(x) != 1: - raise TensorForceError('Layered network must have only one input, but {} given.'.format(len(x))) - x = next(iter(x.values())) + else: + return layer.get_architecture() - next_internals = dict() - for layer in self.layers: - layer_internals = {name: internals['{}_{}'.format(layer.scope, name)] for name in layer.internals_spec()} + def _parse_layers_spec(self, *, spec, counter): + if isinstance(spec, list): + return [self._parse_layers_spec(spec=s, counter=counter) for s in spec] - if len(layer_internals) > 0: - x, layer_internals = layer.apply(x=x, update=update, **layer_internals) - for name, internal in layer_internals.items(): - next_internals['{}_{}'.format(layer.scope, name)] = internal + else: + if callable(spec): + spec = dict(type='function', function=spec) + elif isinstance(spec, str): + spec = dict(type=spec) + + # Deprecated + if spec.get('type') in ('internal_rnn', 'internal_lstm', 'internal_gru'): + raise TensorforceError.deprecated( + name='Network layers', argument=spec['type'], replacement=spec['type'][9:] + ) - else: - x = layer.apply(x=x, update=update) + if 'name' in spec: + spec = 
dict(spec) + name = spec.pop('name') - if return_internals: - return x, next_internals + else: + layer_type = spec.get('type') + if not isinstance(layer_type, str): + layer_type = 'layer' + name = layer_type + str(counter[layer_type]) + counter[layer_type] += 1 + + return self.submodule(name=name, module=spec) + + @tf_function(num_args=4) + def apply(self, *, x, horizons, internals, deterministic, independent): + if x.is_singleton(): + registered_tensors = TensorDict(state=x.singleton()) else: - return x + registered_tensors = x.copy() + x = x.value() + + temporal_layer_check = False + x, _ = LayeredNetwork._recursive_apply( + layer=self.layers, x=x, horizons=horizons, internals=internals, + deterministic=deterministic, independent=independent, + registered_tensors=registered_tensors, temporal_layer_check=temporal_layer_check + ) + + if self.outputs is not None: + x = TensorDict(embedding=x) + x.update(((output, registered_tensors[output]) for output in self.outputs)) + + return x, internals @staticmethod - def from_json(filename): - """ - Creates a layer_networkd_builder from a JSON. - - Args: - filename: Path to configuration - - Returns: A layered_network_builder function with layers generated from the JSON - """ - path = os.path.join(os.getcwd(), filename) - with open(path, 'r') as fp: - config = json.load(fp=fp) - return LayeredNetwork(layers_spec=config) + def _recursive_apply( + *, layer, x, horizons, internals, deterministic, independent, registered_tensors, + temporal_layer_check + ): + if isinstance(layer, list): + for layer in layer: + x, temporal_layer_check = LayeredNetwork._recursive_apply( + layer=layer, x=x, horizons=horizons, internals=internals, + deterministic=deterministic, independent=independent, + registered_tensors=registered_tensors, + temporal_layer_check=temporal_layer_check + ) + + elif isinstance(layer, Block): + for layer in layer.layers: + x, temporal_layer_check = LayeredNetwork._recursive_apply( + layer=layer, x=x, horizons=horizons, internals=internals, + deterministic=deterministic, independent=independent, + registered_tensors=registered_tensors, + temporal_layer_check=temporal_layer_check + ) + + elif isinstance(layer, Reuse): + x, temporal_layer_check = LayeredNetwork._recursive_apply( + layer=layer.reused_layer, x=x, horizons=horizons, internals=internals, + deterministic=deterministic, independent=independent, + registered_tensors=registered_tensors, + temporal_layer_check=temporal_layer_check + ) + + elif isinstance(layer, Register): + if layer.tensor in registered_tensors: + raise TensorforceError.exists(name='registered tensor', value=layer.tensor) + x = layer.apply(x=x) + registered_tensors[layer.tensor] = x + + elif isinstance(layer, MultiInputLayer): + if layer.tensors not in registered_tensors: + raise TensorforceError.exists_not(name='registered tensor', value=layer.tensors) + x = layer.apply(x=registered_tensors[layer.tensors]) + temporal_layer_check = False + + elif isinstance(layer, NondeterministicLayer): + x = layer.apply(x=x, deterministic=deterministic) + + elif isinstance(layer, StatefulLayer): + x = layer.apply(x=x, independent=independent) + + elif isinstance(layer, TemporalLayer): + if temporal_layer_check: + raise TensorforceError( + "Multiple successive temporal layers like RNNs are currently not supported." 
+ ) + x, internals[layer.name] = layer.apply( + x=x, horizons=horizons, internals=internals[layer.name] + ) + temporal_layer_check = True + + else: + x = layer.apply(x=x) + + return x, temporal_layer_check diff --git a/tensorforce/core/networks/preprocessor.py b/tensorforce/core/networks/preprocessor.py new file mode 100644 index 000000000..81845ec37 --- /dev/null +++ b/tensorforce/core/networks/preprocessor.py @@ -0,0 +1,163 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import SignatureDict, TensorDict, TensorSpec, TensorsSpec, tf_function, \ + tf_util +from tensorforce.core.layers import MultiInputLayer, NondeterministicLayer, PreprocessingLayer, \ + Register, StatefulLayer, TemporalLayer +from tensorforce.core.networks import LayeredNetwork + + +class Preprocessor(LayeredNetwork): + """ + Special preprocessor network following a sequential layer-stack architecture, which can be + specified as either a single or a list of layer specifications. + + Args: + layers (iter[specification] | iter[iter[specification]]): Layers configuration, see the + [layers documentation](../modules/layers.html) + (required). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + is_preprocessing_layer_valid (bool): internal use. + name (string): internal use. + input_spec (specification): internal use. 
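Editorial illustration of the Args above, not part of the diff: the layers argument of this preprocessor is an ordinary sequential layer-stack specification, i.e. a single layer spec or a list of layer specs. Layer types and arguments used here ('conv2d', 'flatten', 'dense') are assumed from the layers documentation:

```python
# Hypothetical preprocessing stack for an image-like state (illustrative only).
preprocessing_layers = [
    dict(type='conv2d', size=16, window=3, stride=2),
    dict(type='flatten'),
    dict(type='dense', size=64)
]
```

A single dict such as `dict(type='dense', size=64)` is equally valid, per the class docstring.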
+ """ + + def __init__( + self, *, layers, device=None, l2_regularization=None, is_preprocessing_layer_valid=True, + name=None, input_spec=None + ): + if not isinstance(input_spec, TensorSpec): + raise TensorforceError.type( + name='preprocessor', argument='inputs_spec', dtype=type(input_spec) + ) + + self.is_preprocessing_layer_valid = is_preprocessing_layer_valid + + super().__init__( + layers=layers, device=device, l2_regularization=l2_regularization, name=name, + inputs_spec=TensorsSpec(singleton=input_spec) + ) + + def invalid_layer_types(self): + if self.is_preprocessing_layer_valid: + return (TemporalLayer,) + else: + return (PreprocessingLayer, TemporalLayer) + + @property + def internals_spec(self): + raise NotImplementedError + + def internals_init(self): + raise NotImplementedError + + def max_past_horizon(self, *, on_policy): + raise NotImplementedError + + def past_horizon(self, *, on_policy): + raise NotImplementedError + + def input_signature(self, *, function): + if function == 'apply': + return SignatureDict( + x=self.inputs_spec.signature(batched=True), + deterministic=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'reset': + return SignatureDict() + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'apply': + return SignatureDict(singleton=self.output_spec().signature(batched=True)) + + elif function == 'reset': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=0) + def reset(self): + operations = list(Preprocessor._recursive_reset(layer=self.layers)) + if len(operations) > 0: + return tf.math.reduce_any(input_tensor=tf.stack(values=operations, axis=0), axis=0) + else: + return tf_util.constant(value=False, dtype='bool') + + @staticmethod + def _recursive_reset(*, layer): + if isinstance(layer, list): + for layer in layer: + yield from Preprocessor._recursive_reset(layer=layer) + + elif isinstance(layer, PreprocessingLayer): + yield layer.reset() + + @tf_function(num_args=2) + def apply(self, *, x, deterministic, independent): + assert x.is_singleton() + x = x.singleton() + registered_tensors = TensorDict(input=x) + + x = Preprocessor._recursive_apply( + layer=self.layers, x=x, deterministic=deterministic, independent=independent, + registered_tensors=registered_tensors + ) + + return x + + @staticmethod + def _recursive_apply(*, layer, x, deterministic, independent, registered_tensors): + if isinstance(layer, list): + for layer in layer: + x = Preprocessor._recursive_apply( + layer=layer, x=x, deterministic=deterministic, independent=independent, + registered_tensors=registered_tensors + ) + + elif isinstance(layer, Register): + if layer.tensor in registered_tensors: + raise TensorforceError.exists(name='registered tensor', value=layer.tensor) + x = layer.apply(x=x) + registered_tensors[layer.tensor] = x + + elif isinstance(layer, MultiInputLayer): + if layer.tensors not in registered_tensors: + raise TensorforceError.exists_not(name='registered tensor', value=layer.tensors) + x = layer.apply(x=registered_tensors[layer.tensors]) + + elif isinstance(layer, NondeterministicLayer): + x = layer.apply(x=x, deterministic=deterministic) + + elif isinstance(layer, StatefulLayer): + x = layer.apply(x=x, independent=independent) + + else: + x = layer.apply(x=x) + + return x diff --git a/tensorforce/core/objectives/__init__.py 
b/tensorforce/core/objectives/__init__.py new file mode 100644 index 000000000..e8854c9b8 --- /dev/null +++ b/tensorforce/core/objectives/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from functools import partial + +from tensorforce.core.objectives.objective import Objective + +from tensorforce.core.objectives.deterministic_policy_gradient import DeterministicPolicyGradient +from tensorforce.core.objectives.plus import Plus +from tensorforce.core.objectives.policy_gradient import PolicyGradient +from tensorforce.core.objectives.value import Value + + +objective_modules = dict( + action_value=partial(Value, value='action'), + deterministic_policy_gradient=DeterministicPolicyGradient, plus=Plus, + policy_gradient=PolicyGradient, state_value=partial(Value, value='state'), value=Value +) + + +__all__ = [ + 'DeterministicPolicyGradient', 'Objective', 'objective_modules', 'Plus', 'PolicyGradient', + 'Value' +] diff --git a/tensorforce/core/objectives/deterministic_policy_gradient.py b/tensorforce/core/objectives/deterministic_policy_gradient.py new file mode 100644 index 000000000..2a3ef5840 --- /dev/null +++ b/tensorforce/core/objectives/deterministic_policy_gradient.py @@ -0,0 +1,91 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.objectives import Objective + + +class DeterministicPolicyGradient(Objective): + """ + Deterministic policy gradient objective (specification key: `det_policy_gradient`). + + Args: + name (string): internal use. + states_spec (specification): internal use. + internals_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. + reward_spec (specification): internal use. 
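To connect the loss defined below to the usual description of DPG, here is a self-contained toy sketch, not part of the diff: the objective is simply the negated critic value of the policy's deterministic action, so its gradient with respect to the actor parameters is the deterministic policy gradient. Actor, critic and all values are made up for illustration.

```python
import tensorflow as tf

theta = tf.Variable([0.5])                     # toy actor parameter (assumed)
states = tf.constant([[1.0], [2.0]])

with tf.GradientTape() as tape:
    actions = states * theta                   # deterministic policy pi(s)
    q_values = -tf.square(actions - 3.0)       # toy critic Q(s, a)
    loss = -tf.reduce_mean(q_values)           # negated action-value, as in the objective

actor_gradient = tape.gradient(loss, [theta])  # deterministic policy gradient
```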
+ """ + + def __init__( + self, *, name=None, states_spec=None, internals_spec=None, auxiliaries_spec=None, + actions_spec=None, reward_spec=None + ): + super().__init__( + name=name, states_spec=states_spec, internals_spec=internals_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec, reward_spec=reward_spec + ) + + if not all(spec.type == 'float' for spec in self.actions_spec.values()): + raise TensorforceError.value( + name='DeterministicPolicyGradient', argument='actions', value=self.actions_spec, + hint='is not a float action' + ) + + def required_policy_fns(self): + return ('policy',) + + def required_baseline_fns(self): + return ('action_value',) + + def reference_spec(self): + # return self.actions_spec + return TensorSpec(type='float', shape=()) + + @tf_function(num_args=5) + def reference(self, *, states, horizons, internals, auxiliaries, actions, policy): + # deterministic = tf_util.constant(value=True, dtype='bool') + # return policy.act( + # states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + # deterministic=deterministic, independent=True + # ) + + return tf_util.zeros(shape=tf.shape(input=actions.value())[:1], dtype='float') + + def loss( + self, *, states, horizons, internals, auxiliaries, actions, reward, reference, policy, + baseline + ): + # actions = self.reference( + # states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + # actions=actions, policy=policy + # ) + + deterministic = tf_util.constant(value=True, dtype='bool') + actions, _ = policy.act( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + deterministic=deterministic, independent=True + ) + + action_value = baseline.action_value( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions + ) + + return -action_value diff --git a/tensorforce/core/objectives/objective.py b/tensorforce/core/objectives/objective.py new file mode 100644 index 000000000..03e1d9268 --- /dev/null +++ b/tensorforce/core/objectives/objective.py @@ -0,0 +1,98 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core import Module, SignatureDict, TensorSpec, tf_function + + +class Objective(Module): + """ + Base class for optimization objectives. + + Args: + name (string): internal use. + states_spec (specification): internal use. + internals_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. + reward_spec (specification): internal use. 
+ """ + + def __init__( + self, *, name=None, states_spec=None, internals_spec=None, auxiliaries_spec=None, + actions_spec=None, reward_spec=None + ): + super().__init__(name=name) + + self.states_spec = states_spec + self.internals_spec = internals_spec + self.auxiliaries_spec = auxiliaries_spec + self.actions_spec = actions_spec + self.reward_spec = reward_spec + + def reference_spec(self): + raise NotImplementedError + + def required_policy_fns(self): + return () + + def required_baseline_fns(self): + return () + + def input_signature(self, *, function): + if function == 'loss': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + reward=self.reward_spec.signature(batched=True), + reference=self.reference_spec().signature(batched=True) + ) + + elif function == 'reference': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True) + ) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'loss': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=True) + ) + + elif function == 'reference': + return SignatureDict(singleton=self.reference_spec().signature(batched=True)) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=5) + def reference(self, *, states, horizons, internals, auxiliaries, actions, policy): + raise NotImplementedError + + @tf_function(num_args=7) + def loss( + self, *, states, horizons, internals, auxiliaries, actions, reward, reference, policy, + baseline=None + ): + raise NotImplementedError diff --git a/tensorforce/core/objectives/plus.py b/tensorforce/core/objectives/plus.py new file mode 100644 index 000000000..e95113c3b --- /dev/null +++ b/tensorforce/core/objectives/plus.py @@ -0,0 +1,137 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import util +import tensorforce.core +from tensorforce.core import TensorSpec, tf_function, tf_util +from tensorforce.core.objectives import Objective + + +class Plus(Objective): + """ + Additive combination of two objectives (specification key: `plus`). + + Args: + objective1 (specification): First objective configuration + (required). + objective2 (specification): Second objective configuration + (required). + name (string): internal use. + states_spec (specification): internal use. 
+ internals_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. + reward_spec (specification): internal use. + """ + + def __init__( + self, *, objective1, objective2, name=None, states_spec=None, + internals_spec=None, auxiliaries_spec=None, actions_spec=None, reward_spec=None + ): + super().__init__( + name=name, states_spec=states_spec, internals_spec=internals_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec, reward_spec=reward_spec + ) + + self.objective1 = self.submodule( + name='objective1', module=objective1, modules=tensorforce.core.objective_modules, + states_spec=states_spec, internals_spec=internals_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec, reward_spec=reward_spec + ) + + self.objective2 = self.submodule( + name='objective2', module=objective2, modules=tensorforce.core.objective_modules, + states_spec=states_spec, internals_spec=internals_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec, reward_spec=reward_spec + ) + + def __setattr__(self, name, value): + super().__setattr__(name, value) + + if hasattr(self, 'objective1') and name in ( + 'states_spec', 'internals_spec', 'auxiliaries_spec', 'actions_spec', 'reward_spec' + ): + self.objective1.__setattr__(name, value) + self.objective2.__setattr__(name, value) + + def required_policy_fns(self): + return self.objective1.required_policy_fns() + self.objective2.required_policy_fns() + + def required_baseline_fns(self): + return self.objective1.required_baseline_fns() + self.objective2.required_baseline_fns() + + def reference_spec(self): + reference_spec1 = self.objective1.reference_spec() + reference_spec2 = self.objective2.reference_spec() + assert reference_spec1.type == reference_spec2.type + shape = (reference_spec1.size + reference_spec2.size,) + return TensorSpec(type=reference_spec1.type, shape=shape) + + def optimizer_arguments(self, **kwargs): + arguments = super().optimizer_arguments() + util.deep_disjoint_update( + target=arguments, source=self.objective1.optimizer_arguments(**kwargs) + ) + util.deep_disjoint_update( + target=arguments, source=self.objective2.optimizer_arguments(**kwargs) + ) + return arguments + + @tf_function(num_args=5) + def reference(self, *, states, horizons, internals, auxiliaries, actions, policy): + reference1 = self.objective1.reference( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions, policy=policy + ) + + reference2 = self.objective2.reference( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions, policy=policy + ) + + shape = (-1, self.objective1.reference_spec().size) + reference1 = tf.reshape(tensor=reference1, shape=shape) + shape = (-1, self.objective2.reference_spec().size) + reference2 = tf.reshape(tensor=reference2, shape=shape) + + return tf.concat(values=(reference1, reference2), axis=1) + + @tf_function(num_args=7) + def loss( + self, *, states, horizons, internals, auxiliaries, actions, reward, reference, policy, + baseline=None + ): + reference_spec1 = self.objective1.reference_spec() + reference_spec2 = self.objective2.reference_spec() + assert tf_util.shape(x=reference)[1] == reference_spec1.size + reference_spec2.size + + reference1 = reference[:, :reference_spec1.size] + reference1 = tf.reshape(tensor=reference1, shape=((-1,) + reference_spec1.shape)) + reference2 = reference[:, reference_spec1.size:] + reference2 = 
tf.reshape(tensor=reference2, shape=((-1,) + reference_spec2.shape)) + + loss1 = self.objective1.loss( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions, reward=reward, reference=reference1, policy=policy, baseline=baseline + ) + + loss2 = self.objective2.loss( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions, reward=reward, reference=reference2, policy=policy, baseline=baseline + ) + + return loss1 + loss2 diff --git a/tensorforce/core/objectives/policy_gradient.py b/tensorforce/core/objectives/policy_gradient.py new file mode 100644 index 000000000..6d9b394ea --- /dev/null +++ b/tensorforce/core/objectives/policy_gradient.py @@ -0,0 +1,152 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np +import tensorflow as tf + +from tensorforce import util +from tensorforce.core import parameter_modules, TensorSpec, tf_function, tf_util +from tensorforce.core.objectives import Objective + + +class PolicyGradient(Objective): + """ + Policy gradient objective, which maximizes the log-likelihood or likelihood-ratio scaled by the + target reward value (specification key: `policy_gradient`). + + Args: + importance_sampling (bool): Whether to use the importance sampling version of the policy + gradient objective + (default: false). + clipping_value (parameter, float > 0.0): Clipping threshold for the maximized value + (default: no clipping). + early_reduce (bool): Whether to compute objective for aggregated likelihood instead of + likelihood per action (default: true). + name (string): internal use. + states_spec (specification): internal use. + internals_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. + reward_spec (specification): internal use. 
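For orientation before the implementation below: with importance sampling and a clipping value c, the maximized target is the familiar clipped surrogate min(r·R, clip(r, 1/(1+c), 1+c)·R), where r = exp(log π(a|s) − reference log-probability) and R is the target reward value. A small NumPy sketch with arbitrary numbers, not part of the diff:

```python
import numpy as np

def clipped_ratio_target(log_prob, reference_log_prob, reward, clipping_value):
    # Likelihood ratio between current and reference policy.
    ratio = np.exp(log_prob - reference_log_prob)
    # Clip to [1 / (1 + c), 1 + c], matching min_value/max_value in the code below.
    clipped = np.clip(ratio, 1.0 / (1.0 + clipping_value), 1.0 + clipping_value)
    # Pessimistic minimum of unclipped and clipped scaled targets (the loss negates this).
    return np.minimum(ratio * reward, clipped * reward)

print(clipped_ratio_target(log_prob=-0.9, reference_log_prob=-1.1, reward=2.0, clipping_value=0.2))
```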
+ """ + + def __init__( + self, *, importance_sampling=False, clipping_value=None, early_reduce=True, name=None, + states_spec=None, internals_spec=None, auxiliaries_spec=None, actions_spec=None, + reward_spec=None + ): + super().__init__( + name=name, states_spec=states_spec, internals_spec=internals_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec, reward_spec=reward_spec + ) + + self.importance_sampling = importance_sampling + + if clipping_value is None: + self.clipping_value = None + else: + self.clipping_value = self.submodule( + name='clipping_value', module=clipping_value, modules=parameter_modules, dtype='float', + min_value=0.0 + ) + + self.early_reduce = early_reduce + + def required_policy_fns(self): + return ('stochastic',) + + def reference_spec(self): + if self.early_reduce: + return TensorSpec(type='float', shape=()) + + else: + return TensorSpec( + type='float', shape=(sum(spec.size for spec in self.actions_spec.values()),) + ) + + @tf_function(num_args=5) + def reference(self, *, states, horizons, internals, auxiliaries, actions, policy): + if self.early_reduce: + log_probability = policy.log_probability( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions + ) + + else: + log_probabilities = policy.log_probabilities( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions + ) + + def function(value, spec): + return tf.reshape(tensor=value, shape=(-1, spec.size)) + + log_probabilities = log_probabilities.fmap( + function=function, zip_values=self.actions_spec + ) + log_probability = tf.concat(values=tuple(log_probabilities.values()), axis=1) + + return log_probability + + @tf_function(num_args=7) + def loss( + self, *, states, horizons, internals, auxiliaries, actions, reward, reference, policy, + baseline=None + ): + log_probability = self.reference( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions, policy=policy + ) + + reference = tf.stop_gradient(input=reference) + + if self.importance_sampling: + log_ratio = log_probability - reference + # Clip log_ratio for numerical stability (epsilon < 1.0, hence negative) + log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float') + log_ratio = tf.clip_by_value( + t=log_ratio, clip_value_min=log_epsilon, clip_value_max=-log_epsilon + ) + target = tf.math.exp(x=log_ratio) + else: + target = log_probability + + if not self.early_reduce: + reward = tf.expand_dims(input=reward, axis=1) + + if self.clipping_value is None: + scaled_target = target * reward + + else: + one = tf_util.constant(value=1.0, dtype='float') + clipping_value = one + self.clipping_value.value() + if self.importance_sampling: + min_value = tf.math.reciprocal(x=clipping_value) + max_value = clipping_value + else: + min_value = reference - tf.math.log(x=clipping_value) + max_value = reference + tf.math.log(x=clipping_value) + + clipped_target = tf.clip_by_value( + t=target, clip_value_min=min_value, clip_value_max=max_value + ) + scaled_target = tf.math.minimum(x=(target * reward), y=(clipped_target * reward)) + + loss = -scaled_target + + if not self.early_reduce: + loss = tf.math.reduce_sum(input_tensor=loss, axis=1) + + return loss diff --git a/tensorforce/core/objectives/value.py b/tensorforce/core/objectives/value.py new file mode 100644 index 000000000..d1bfc3ffa --- /dev/null +++ b/tensorforce/core/objectives/value.py @@ -0,0 +1,164 @@ +# Copyright 2020 Tensorforce Team. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import parameter_modules, TensorSpec, tf_function, tf_util +from tensorforce.core.objectives import Objective + + +class Value(Objective): + """ + Value approximation objective, which minimizes the L2-distance between the state-(action-)value + estimate and the target reward value + (specification key: `value`, `state_value`, `action_value`). + + Args: + value ("state" | "action"): Whether to approximate the state- or state-action-value + (required). + huber_loss (parameter, float > 0.0): Huber loss threshold + (default: no huber loss). + early_reduce (bool): Whether to compute objective for aggregated value instead of value per + action (default: true). + name (string): internal use. + states_spec (specification): internal use. + internals_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. + reward_spec (specification): internal use. + """ + + def __init__( + self, *, value, huber_loss=None, early_reduce=True, name=None, states_spec=None, + internals_spec=None, auxiliaries_spec=None, actions_spec=None, reward_spec=None + ): + super().__init__( + name=name, states_spec=states_spec, internals_spec=internals_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec, reward_spec=reward_spec + ) + + assert value in ('state', 'action') + self.value = value + + if huber_loss is None: + self.huber_loss = None + else: + self.huber_loss = self.submodule( + name='huber_loss', module=huber_loss, modules=parameter_modules, dtype='float', + min_value=0.0 + ) + + self.early_reduce = early_reduce + + def required_policy_fns(self): + if self.value == 'state': + return ('state_value',) + elif self.value == 'action': + return ('action_value',) + + def reference_spec(self): + # if self.early_reduce: + return TensorSpec(type='float', shape=()) + + # else: + # return TensorSpec( + # type='float', shape=(sum(spec.size for spec in self.actions_spec.values()),) + # ) + + @tf_function(num_args=5) + def reference(self, *, states, horizons, internals, auxiliaries, actions, policy): + # if self.value == 'state': + # if self.early_reduce: + # value = policy.state_value( + # states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + # ) + # else: + # value = policy.state_values( + # states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + # ) + # value = tf.concat(values=tuple(value.values()), axis=1) + + # elif self.value == 'action': + # if self.early_reduce: + # value = policy.action_value( + # states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + # actions=actions + # ) + # else: + # value = policy.action_values( + # states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + # actions=actions + # ) + # value = 
tf.concat(values=tuple(value.values()), axis=1) + + return tf_util.zeros(shape=tf.shape(input=actions.value())[:1], dtype='float') + + @tf_function(num_args=7) + def loss( + self, *, states, horizons, internals, auxiliaries, actions, reward, reference, policy, + baseline=None + ): + # value = self.reference( + # states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + # actions=actions, policy=policy + # ) + + # reference = tf.stop_gradient(input=reference) + + if self.value == 'state': + if self.early_reduce: + value = policy.state_value( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + else: + value = policy.state_values( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + value = tf.concat(values=tuple(value.values()), axis=1) + + elif self.value == 'action': + if self.early_reduce: + value = policy.action_value( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions + ) + else: + value = policy.action_values( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions + ) + value = tf.concat(values=tuple(value.values()), axis=1) + + if not self.early_reduce: + reward = tf.expand_dims(input=reward, axis=1) + + difference = value - reward + + half = tf_util.constant(value=0.5, dtype='float') + + if self.huber_loss is None: + loss = half * tf.math.square(x=difference) + + else: + huber_loss = self.huber_loss.value() + inside_huber_bounds = tf.math.less_equal(x=tf.math.abs(x=difference), y=huber_loss) + quadratic = half * tf.math.square(x=difference) + linear = huber_loss * (tf.math.abs(x=difference) - half * huber_loss) + loss = tf.where(condition=inside_huber_bounds, x=quadratic, y=linear) + + if not self.early_reduce: + loss = tf.math.reduce_sum(input_tensor=loss, axis=1) + + return loss diff --git a/tensorforce/core/optimizers/__init__.py b/tensorforce/core/optimizers/__init__.py index a20c20977..4f4824bc3 100755 --- a/tensorforce/core/optimizers/__init__.py +++ b/tensorforce/core/optimizers/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,50 +13,43 @@ # limitations under the License. 
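A brief editorial note on the Value objective's loss above, not part of the diff: the optional Huber loss is the standard piecewise form, quadratic within the threshold and linear (with matching value and slope) beyond it. A plain-Python sketch of the per-element computation:

```python
def huber(difference, threshold):
    # Mirrors the quadratic/linear branches of the Value objective above.
    if abs(difference) <= threshold:
        return 0.5 * difference ** 2
    return threshold * (abs(difference) - 0.5 * threshold)

print(huber(0.3, 1.0), huber(2.5, 1.0))  # 0.045 2.0
```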
# ============================================================================== +from functools import partial + from tensorforce.core.optimizers.optimizer import Optimizer -from tensorforce.core.optimizers.meta_optimizer import MetaOptimizer -from tensorforce.core.optimizers.global_optimizer import GlobalOptimizer -from tensorforce.core.optimizers.tf_optimizer import TFOptimizer + +from tensorforce.core.optimizers.update_modifier import UpdateModifier + +from tensorforce.core.optimizers.clipping_step import ClippingStep from tensorforce.core.optimizers.evolutionary import Evolutionary -from tensorforce.core.optimizers.natural_gradient import NaturalGradient -from tensorforce.core.optimizers.clipped_step import ClippedStep +from tensorforce.core.optimizers.doublecheck_step import DoublecheckStep +from tensorforce.core.optimizers.global_optimizer import GlobalOptimizer +from tensorforce.core.optimizers.linesearch_step import LinesearchStep from tensorforce.core.optimizers.multi_step import MultiStep -from tensorforce.core.optimizers.optimized_step import OptimizedStep +from tensorforce.core.optimizers.natural_gradient import NaturalGradient +from tensorforce.core.optimizers.plus import Plus from tensorforce.core.optimizers.subsampling_step import SubsamplingStep from tensorforce.core.optimizers.synchronization import Synchronization +from tensorforce.core.optimizers.tf_optimizer import TFOptimizer, tensorflow_optimizers + +from tensorforce.core.optimizers.optimizer_wrapper import OptimizerWrapper -# This can register any class inheriting from tf.train.Optimizer -optimizers = dict( - global_optimizer=GlobalOptimizer, - adadelta=TFOptimizer.get_wrapper(optimizer='adadelta'), - adagrad=TFOptimizer.get_wrapper(optimizer='adagrad'), - adam=TFOptimizer.get_wrapper(optimizer='adam'), - nadam=TFOptimizer.get_wrapper(optimizer='nadam'), - gradient_descent=TFOptimizer.get_wrapper(optimizer='gradient_descent'), - momentum=TFOptimizer.get_wrapper(optimizer='momentum'), - rmsprop=TFOptimizer.get_wrapper(optimizer='rmsprop'), - evolutionary=Evolutionary, - natural_gradient=NaturalGradient, - clipped_step=ClippedStep, - multi_step=MultiStep, - optimized_step=OptimizedStep, - subsampling_step=SubsamplingStep, - synchronization=Synchronization +optimizer_modules = dict( + clipping_step=ClippingStep, default=OptimizerWrapper, doublecheck_step=DoublecheckStep, + evolutionary=Evolutionary, global_optimizer=GlobalOptimizer, linesearch_step=LinesearchStep, + multi_step=MultiStep, natural_gradient=NaturalGradient, optimizer_wrapper=OptimizerWrapper, + plus=Plus, subsampling_step=SubsamplingStep, synchronization=Synchronization, + tf_optimizer=TFOptimizer ) +for name, optimizer in tensorflow_optimizers.items(): + assert name not in optimizer_modules + optimizer_modules[name] = partial(TFOptimizer, optimizer=name) + + __all__ = [ - 'optimizers', - 'Optimizer', - 'MetaOptimizer', - 'GlobalOptimizer', - 'TFOptimizer', - 'Evolutionary', - 'NaturalGradient', - 'ClippedStep', - 'MultiStep', - 'OptimizedStep', - 'SubsamplingStep', - 'Synchronization' + 'ClippingStep', 'DoublecheckStep', 'Evolutionary', 'GlobalOptimizer', 'LinesearchStep', + 'MultiStep', 'NaturalGradient', 'Optimizer', 'optimizer_modules', 'Plus', 'SubsamplingStep', + 'Synchronization', 'TFOptimizer', 'UpdateModifier', 'UpdateModifierWrapper' ] diff --git a/tensorforce/core/optimizers/clipped_step.py b/tensorforce/core/optimizers/clipped_step.py deleted file mode 100755 index 0300fb81c..000000000 --- a/tensorforce/core/optimizers/clipped_step.py +++ 
/dev/null @@ -1,73 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce.core.optimizers import MetaOptimizer - - -class ClippedStep(MetaOptimizer): - """ - The clipped-shep meta optimizer clips the values of the optimization step proposed by another - optimizer. - """ - - def __init__(self, optimizer, clipping_value, scope='clipped-step', summary_labels=()): - """ - Creates a new multi-step meta optimizer instance. - - Args: - optimizer: The optimizer which is modified by this meta optimizer. - clipping_value: Clip deltas at this value. - """ - assert isinstance(clipping_value, float) and clipping_value > 0.0 - self.clipping_value = clipping_value - - super(ClippedStep, self).__init__(optimizer=optimizer, scope=scope, summary_labels=summary_labels) - - def tf_step(self, time, variables, **kwargs): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - **kwargs: Additional arguments passed on to the internal optimizer. - - Returns: - List of delta tensors corresponding to the updates for each optimized variable. - """ - deltas = self.optimizer.step(time=time, variables=variables, **kwargs) - - with tf.control_dependencies(control_inputs=deltas): - clipped_deltas = list() - exceeding_deltas = list() - for delta in deltas: - clipped_delta = tf.clip_by_value( - t=delta, - clip_value_min=-self.clipping_value, - clip_value_max=self.clipping_value - ) - clipped_deltas.append(clipped_delta) - exceeding_deltas.append(clipped_delta - delta) - - applied = self.apply_step(variables=variables, deltas=exceeding_deltas) - - with tf.control_dependencies(control_inputs=(applied,)): - return [delta + 0.0 for delta in clipped_deltas] diff --git a/tensorforce/core/optimizers/clipping_step.py b/tensorforce/core/optimizers/clipping_step.py new file mode 100644 index 000000000..b21c0b3c0 --- /dev/null +++ b/tensorforce/core/optimizers/clipping_step.py @@ -0,0 +1,88 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
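Before the ClippingStep modifier defined just below, a quick reminder of what its default 'global_norm' mode does, not part of the diff: all update deltas are rescaled jointly whenever their combined norm exceeds the threshold, via TensorFlow's standard tf.clip_by_global_norm. Tensor values here are arbitrary:

```python
import tensorflow as tf

deltas = [tf.constant([3.0, 4.0]), tf.constant([0.0, 12.0])]   # global norm = 13.0
clipped, unclipped_norm = tf.clip_by_global_norm(deltas, clip_norm=1.0)
# Each delta is scaled by clip_norm / 13.0; the unclipped norm (13.0) is also
# returned, which the modifier below records as its 'unclipped-norm' summary.
```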
+# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import parameter_modules, tf_function, tf_util +from tensorforce.core.optimizers import UpdateModifier + + +class ClippingStep(UpdateModifier): + """ + Clipping-step update modifier, which clips the updates of the given optimizer (specification + key: `clipping_step`). + + Args: + optimizer (specification): Optimizer configuration + (required). + threshold (parameter, float > 0.0): Clipping threshold + (required). + mode ('global_norm' | 'norm' | 'value'): Clipping mode + (default: 'global_norm'). + name (string): (internal use). + arguments_spec (specification): internal use. + """ + + def __init__(self, *, optimizer, threshold, mode='global_norm', name=None, arguments_spec=None): + super().__init__(optimizer=optimizer, name=name, arguments_spec=arguments_spec) + + self.threshold = self.submodule( + name='threshold', module=threshold, modules=parameter_modules, dtype='float', + min_value=0.0 + ) + + assert mode in ('global_norm', 'norm', 'value') + self.mode = mode + + def initialize(self): + super().initialize() + + self.register_summary(label='update-norm', name='unclipped-norm') + + @tf_function(num_args=1) + def step(self, *, arguments, variables, **kwargs): + deltas = self.optimizer.step(arguments=arguments, variables=variables, **kwargs) + + with tf.control_dependencies(control_inputs=deltas): + threshold = self.threshold.value() + if self.mode == 'global_norm': + clipped_deltas, update_norm = tf.clip_by_global_norm( + t_list=deltas, clip_norm=threshold + ) + else: + clipped_deltas = list() + for delta in deltas: + if self.mode == 'norm': + clipped_delta = tf.clip_by_norm(t=delta, clip_norm=threshold) + elif self.mode == 'value': + clipped_delta = tf.clip_by_value( + t=delta, clip_value_min=-threshold, clip_value_max=threshold + ) + clipped_deltas.append(clipped_delta) + + def update_norm(): + return tf.linalg.global_norm(t_list=deltas) + + dependencies = self.summary( + label='update-norm', name='unclipped-norm', data=update_norm, step='updates' + ) + + for variable, delta, clipped_delta in zip(variables, deltas, clipped_deltas): + dependencies.append( + variable.assign_add(delta=(clipped_delta - delta), read_value=False) + ) + + with tf.control_dependencies(control_inputs=dependencies): + return [tf_util.identity(input=delta) for delta in clipped_deltas] diff --git a/tensorforce/core/optimizers/doublecheck_step.py b/tensorforce/core/optimizers/doublecheck_step.py new file mode 100644 index 000000000..49fc5ee66 --- /dev/null +++ b/tensorforce/core/optimizers/doublecheck_step.py @@ -0,0 +1,54 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
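The new `clipping_step` modifier above is selected via its specification key and wraps an inner optimizer specification. As an editorial usage sketch (the inner Adam optimizer, learning rate and threshold are illustrative assumptions, not values taken from this diff):

```python
# Hypothetical nested optimizer spec: clip the per-update deltas proposed by an
# inner Adam optimizer, using the default global-norm clipping mode.
optimizer = dict(
    type='clipping_step',
    optimizer=dict(type='adam', learning_rate=1e-3),
    threshold=0.5,
    mode='global_norm'  # per the docstring, 'norm' and 'value' are the alternatives
)
```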
+# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import tf_function +from tensorforce.core.optimizers import UpdateModifier + + +class DoublecheckStep(UpdateModifier): + """ + Double-check update modifier, which checks whether the update of the given optimizer has + decreased the loss and otherwise reverses it (specification key: `doublecheck_step`). + + Args: + optimizer (specification): Optimizer configuration + (required). + name (string): (internal use). + arguments_spec (specification): internal use. + """ + + @tf_function(num_args=1) + def step(self, *, arguments, variables, fn_loss, **kwargs): + loss_before = fn_loss(**arguments.to_kwargs()) + + with tf.control_dependencies(control_inputs=(loss_before,)): + deltas = self.optimizer.step( + arguments=arguments, variables=variables, fn_loss=fn_loss, **kwargs + ) + + with tf.control_dependencies(control_inputs=deltas): + loss_after = fn_loss(**arguments.to_kwargs()) + + def reverse_update(): + assignments = list() + for variable, delta in zip(variables, deltas): + assignments.append(variable.assign_add(delta=-delta, read_value=False)) + with tf.control_dependencies(control_inputs=assignments): + return [tf.zeros_like(input=delta) for delta in deltas] + + is_improvement = (loss_after < loss_before) + return tf.cond(pred=is_improvement, true_fn=(lambda: deltas), false_fn=reverse_update) diff --git a/tensorforce/core/optimizers/evolutionary.py b/tensorforce/core/optimizers/evolutionary.py index 6322eae51..0101369b0 100755 --- a/tensorforce/core/optimizers/evolutionary.py +++ b/tensorforce/core/optimizers/evolutionary.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,120 +13,145 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from six.moves import xrange import tensorflow as tf -from tensorforce import util +from tensorforce.core import parameter_modules, tf_function, tf_util from tensorforce.core.optimizers import Optimizer class Evolutionary(Optimizer): """ - Evolutionary optimizer which samples random perturbations and applies them either positively - or negatively, depending on their improvement of the loss. + Evolutionary optimizer, which samples random perturbations and applies them either as positive + or negative update depending on their improvement of the loss (specification key: + `evolutionary`). + + Args: + learning_rate (parameter, float > 0.0): Learning rate + (required). + num_samples (parameter, int >= 1): Number of sampled perturbations + (default: 1). + name (string): (internal use). + arguments_spec (specification): internal use. """ - def __init__(self, learning_rate, num_samples=1, unroll_loop=False, scope='evolutionary', summary_labels=()): - """ - Creates a new evolutionary optimizer instance. - - Args: - learning_rate: Learning rate. - num_samples: Number of sampled perturbations. 
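To make the sampling rule described in the new `Evolutionary` docstring concrete, here is a small NumPy sketch of the same idea (an editorial illustration only; the actual implementation above operates on TensorFlow variables inside the graph and accumulates the signed perturbations in a `tf.while_loop`):

```python
import numpy as np

def evolutionary_step(fn_loss, weights, learning_rate=1e-2, num_samples=5, rng=None):
    # Sample Gaussian perturbations scaled by the learning rate; count each one as a
    # positive update if it lowers the loss and as a negative update otherwise, then
    # apply the average of the signed perturbations.
    rng = np.random.default_rng() if rng is None else rng
    unperturbed_loss = fn_loss(weights)
    delta = np.zeros_like(weights)
    for _ in range(num_samples):
        perturbation = learning_rate * rng.standard_normal(size=weights.shape)
        direction = 1.0 if fn_loss(weights + perturbation) < unperturbed_loss else -1.0
        delta += direction * perturbation
    return weights + delta / num_samples

# Toy usage: a few steps on a simple quadratic loss.
w = np.array([1.0, -2.0, 0.5])
for _ in range(200):
    w = evolutionary_step(lambda x: float(np.sum(x ** 2)), w)
```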
- """ - assert isinstance(learning_rate, float) and learning_rate > 0.0 - self.learning_rate = learning_rate - - assert isinstance(num_samples, int) and num_samples > 0 - self.num_samples = num_samples - - assert isinstance(unroll_loop, bool) - self.unroll_loop = unroll_loop - - super(Evolutionary, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_step( - self, - time, - variables, - arguments, - fn_loss, - **kwargs - ): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - arguments: Dict of arguments for callables, like fn_loss. - fn_loss: A callable returning the loss of the current model. - **kwargs: Additional arguments, not used. - - Returns: - List of delta tensors corresponding to the updates for each optimized variable. - """ - unperturbed_loss = fn_loss(**arguments) - - # First sample - perturbations = [tf.random_normal(shape=util.shape(variable)) * self.learning_rate for variable in variables] - applied = self.apply_step(variables=variables, deltas=perturbations) - - with tf.control_dependencies(control_inputs=(applied,)): - perturbed_loss = fn_loss(**arguments) - direction = tf.sign(x=(unperturbed_loss - perturbed_loss)) - deltas_sum = [direction * perturbation for perturbation in perturbations] - - if self.unroll_loop: - # Unrolled for loop - previous_perturbations = perturbations - for sample in xrange(self.num_samples): - - with tf.control_dependencies(control_inputs=deltas_sum): - perturbations = [tf.random_normal(shape=util.shape(variable)) * self.learning_rate for variable in variables] - perturbation_deltas = [ - pert - prev_pert for pert, prev_pert in zip(perturbations, previous_perturbations) - ] - applied = self.apply_step(variables=variables, deltas=perturbation_deltas) - previous_perturbations = perturbations - - with tf.control_dependencies(control_inputs=(applied,)): - perturbed_loss = fn_loss(**arguments) - direction = tf.sign(x=(unperturbed_loss - perturbed_loss)) - deltas_sum = [delta + direction * perturbation for delta, perturbation in zip(deltas_sum, perturbations)] + def __init__(self, *, learning_rate, num_samples=1, name=None, arguments_spec=None): + super().__init__(name=name, arguments_spec=arguments_spec) + + self.learning_rate = self.submodule( + name='learning_rate', module=learning_rate, modules=parameter_modules, dtype='float', + min_value=0.0 + ) + + if num_samples is None: + num_samples = 1 + self.num_samples = self.submodule( + name='num_samples', module=num_samples, modules=parameter_modules, dtype='int', + min_value=1 + ) + + @tf_function(num_args=1) + def step(self, *, arguments, variables, fn_loss, **kwargs): + learning_rate = self.learning_rate.value() + + unperturbed_loss = fn_loss(**arguments.to_kwargs()) + + if self.num_samples.is_constant(value=1): + deltas = list() + for variable in variables: + delta = tf.random.normal(shape=variable.shape, dtype=variable.dtype) + if variable.dtype == tf_util.get_dtype(type='float'): + deltas.append(learning_rate * delta) + else: + deltas.append(tf.cast(x=learning_rate, dtype=variable.dtype) * delta) + + assignments = list() + for variable, delta in zip(variables, deltas): + assignments.append(variable.assign_add(delta=delta, read_value=False)) + + with tf.control_dependencies(control_inputs=assignments): + perturbed_loss = fn_loss(**arguments.to_kwargs()) + + def negate_deltas(): + neg_two_float = tf_util.constant(value=-2.0, dtype='float') + assignments = list() + for variable, 
delta in zip(variables, deltas): + if variable.dtype == tf_util.get_dtype(type='float'): + assignments.append( + variable.assign_add(delta=(neg_two_float * delta), read_value=False) + ) + else: + _ng_two_float = tf.constant(value=-2.0, dtype=variable.dtype) + assignments.append( + variable.assign_add(delta=(_ng_two_float * delta), read_value=False) + ) + + with tf.control_dependencies(control_inputs=assignments): + return [tf.math.negative(x=delta) for delta in deltas] + + return tf.cond( + pred=(perturbed_loss < unperturbed_loss), true_fn=(lambda: deltas), + false_fn=negate_deltas + ) else: - # TensorFlow while loop - def body(iteration, deltas_sum, previous_perturbations): + deltas = [tf.zeros_like(input=variable) for variable in variables] + previous_perturbations = [tf.zeros_like(input=variable) for variable in variables] + + def body(deltas, previous_perturbations): + with tf.control_dependencies(control_inputs=deltas): + perturbations = list() + for variable in variables: + perturbation = tf.random.normal(shape=variable.shape, dtype=variable.dtype) + if variable.dtype == tf_util.get_dtype(type='float'): + perturbations.append(learning_rate * perturbation) + else: + perturbations.append( + tf.cast(x=learning_rate, dtype=variable.dtype) * perturbation + ) - with tf.control_dependencies(control_inputs=deltas_sum): - perturbations = [tf.random_normal(shape=util.shape(variable)) * self.learning_rate for variable in variables] perturbation_deltas = [ - pert - prev_pert for pert, prev_pert in zip(perturbations, previous_perturbations) + pert - prev_pert + for pert, prev_pert in zip(perturbations, previous_perturbations) ] - applied = self.apply_step(variables=variables, deltas=perturbation_deltas) - - with tf.control_dependencies(control_inputs=(applied,)): - perturbed_loss = fn_loss(**arguments) - direction = tf.sign(x=(unperturbed_loss - perturbed_loss)) - deltas_sum = [delta + direction * perturbation for delta, perturbation in zip(deltas_sum, perturbations)] - - return iteration + 1, deltas_sum, perturbations - - def cond(iteration, deltas_sum, previous_perturbation): - return iteration < self.num_samples - 1 - - _, deltas_sum, perturbations = tf.while_loop(cond=cond, body=body, loop_vars=(0, deltas_sum, perturbations)) - - with tf.control_dependencies(control_inputs=deltas_sum): - deltas = [delta / self.num_samples for delta in deltas_sum] - perturbation_deltas = [delta - pert for delta, pert in zip(deltas, perturbations)] - applied = self.apply_step(variables=variables, deltas=perturbation_deltas) - - with tf.control_dependencies(control_inputs=(applied,)): - # Trivial operation to enforce control dependency - return [delta + 0.0 for delta in deltas] + assignments = list() + for variable, delta in zip(variables, perturbation_deltas): + assignments.append(variable.assign_add(delta=delta, read_value=False)) + + with tf.control_dependencies(control_inputs=assignments): + perturbed_loss = fn_loss(**arguments.to_kwargs()) + + one_float = tf_util.constant(value=1.0, dtype='float') + neg_one_float = tf_util.constant(value=-1.0, dtype='float') + direction = tf.where( + condition=(perturbed_loss < unperturbed_loss), x=one_float, y=neg_one_float + ) + + next_deltas = list() + for variable, delta, perturbation in zip(variables, deltas, perturbations): + if variable.dtype == tf_util.get_dtype(type='float'): + next_deltas.append(delta + direction * perturbation) + else: + next_deltas.append( + delta + tf.cast(x=direction, dtype=variable.dtype) * perturbation + ) + + return next_deltas, 
perturbations + + num_samples = self.num_samples.value() + deltas, perturbations = tf.while_loop( + cond=tf_util.always_true, body=body, loop_vars=(deltas, previous_perturbations), + maximum_iterations=tf_util.int32(x=num_samples) + ) + + with tf.control_dependencies(control_inputs=deltas): + num_samples = tf_util.cast(x=num_samples, dtype='float') + deltas = [delta / num_samples for delta in deltas] + + perturbation_deltas = [delta - pert for delta, pert in zip(deltas, perturbations)] + assignments = list() + for variable, delta in zip(variables, perturbation_deltas): + assignments.append(variable.assign_add(delta=delta, read_value=False)) + + with tf.control_dependencies(control_inputs=assignments): + # Trivial operation to enforce control dependency + return [tf_util.identity(input=delta) for delta in deltas] diff --git a/tensorforce/core/optimizers/global_optimizer.py b/tensorforce/core/optimizers/global_optimizer.py index c19a90abd..34e0278ed 100755 --- a/tensorforce/core/optimizers/global_optimizer.py +++ b/tensorforce/core/optimizers/global_optimizer.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,66 +13,55 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import tensorflow as tf -from tensorforce import util -from tensorforce.core.optimizers import MetaOptimizer +from tensorforce.core import tf_function, tf_util +from tensorforce.core.optimizers import UpdateModifier -class GlobalOptimizer(MetaOptimizer): +class GlobalOptimizer(UpdateModifier): """ - The global optimizer applies an optimizer to the local variables. In addition, it also - applies the update to a corresponding set of global variables and subsequently updates the local - variables to the value of these global variables. - Note: This is used for the current distributed mode, and will likely change with the next - major version update. + Global update modifier, which applies the given optimizer to the local variables, then applies + the update to a corresponding set of global variables, and subsequently updates the local + variables to the value of the global variables; will likely change in the future (specification + key: `global_optimizer`). + + Args: + optimizer (specification): Optimizer configuration + (required). + name (string): (internal use). + arguments_spec (specification): internal use. """ - def __init__(self, optimizer, scope='global-optimizer', summary_labels=()): - """ - Creates a new global optimizer instance. - - Args: - optimizer: The optimizer which is modified by this meta optimizer. - """ - super(GlobalOptimizer, self).__init__(optimizer=optimizer, scope=scope, summary_labels=summary_labels) - - def tf_step(self, time, variables, global_variables, **kwargs): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - global_variables: List of global variables to apply the proposed optimization step to. - **kwargs: ??? 
coming soon + @tf_function(num_args=1) + def step(self, *, arguments, variables, **kwargs): + global_variables = kwargs['global_variables'] - Returns: - List of delta tensors corresponding to the updates for each optimized variable. - """ assert all( - util.shape(global_variable) == util.shape(local_variable) + tf_util.shape(x=global_variable) == tf_util.shape(x=local_variable) for global_variable, local_variable in zip(global_variables, variables) ) - local_deltas = self.optimizer.step(time=time, variables=variables, **kwargs) + local_deltas = self.optimizer.step(arguments=arguments, variables=variables, **kwargs) with tf.control_dependencies(control_inputs=local_deltas): - applied = self.optimizer.apply_step(variables=global_variables, deltas=local_deltas) + assignments = list() + for variable, delta in zip(global_variables, local_deltas): + assignments.append(variable.assign_add(delta=delta, read_value=False)) - with tf.control_dependencies(control_inputs=(applied,)): + with tf.control_dependencies(control_inputs=assignments): update_deltas = list() for global_variable, local_variable in zip(global_variables, variables): - delta = global_variable - local_variable - update_deltas.append(delta) + update_deltas.append(global_variable - local_variable) - applied = self.apply_step(variables=variables, deltas=update_deltas) + assignments = list() + for variable, delta in zip(variables, update_deltas): + assignments.append(variable.assign_add(delta=delta, read_value=False)) # TODO: Update time, episode, etc (like in Synchronization)? - with tf.control_dependencies(control_inputs=(applied,)): - return [local_delta + update_delta for local_delta, update_delta in zip(local_deltas, update_deltas)] + with tf.control_dependencies(control_inputs=assignments): + return [ + local_delta + update_delta + for local_delta, update_delta in zip(local_deltas, update_deltas) + ] diff --git a/tensorforce/core/optimizers/linesearch_step.py b/tensorforce/core/optimizers/linesearch_step.py new file mode 100644 index 000000000..004b6627e --- /dev/null +++ b/tensorforce/core/optimizers/linesearch_step.py @@ -0,0 +1,98 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import TensorDict, TensorSpec, TensorsSpec, tf_function, tf_util +from tensorforce.core.optimizers import UpdateModifier +from tensorforce.core.optimizers.solvers import solver_modules + + +class LinesearchStep(UpdateModifier): + """ + Line-search-step update modifier, which performs a line search on the update step returned by + the given optimizer to find a potentially superior smaller step size + (specification key: `linesearch_step`). + + Args: + optimizer (specification): Optimizer configuration + (required). + max_iterations (parameter, int >= 1): Maximum number of line search iterations + (required). 
+ backtracking_factor (parameter, 0.0 < float < 1.0): Line search backtracking factor + (default: 0.75). + name (string): (internal use). + arguments_spec (specification): internal use. + """ + + def __init__( + self, *, optimizer, max_iterations, backtracking_factor=0.75, name=None, arguments_spec=None + ): + super().__init__(optimizer=optimizer, name=name, arguments_spec=arguments_spec) + + self.line_search = self.submodule( + name='line_search', module='line_search', modules=solver_modules, + max_iterations=max_iterations, backtracking_factor=backtracking_factor + ) + + def initialize_given_variables(self, *, variables): + super().initialize_given_variables(variables=variables) + + self.line_search.complete_initialize( + arguments_spec=self.arguments_spec, values_spec=self.variables_spec + ) + + @tf_function(num_args=1) + def step(self, *, arguments, variables, fn_loss, **kwargs): + loss_before = fn_loss(**arguments.to_kwargs()) + + with tf.control_dependencies(control_inputs=(loss_before,)): + deltas = self.optimizer.step( + arguments=arguments, variables=variables, fn_loss=fn_loss, **kwargs + ) + + with tf.control_dependencies(control_inputs=deltas): + + def linesearch(): + loss_after = fn_loss(**arguments.to_kwargs()) + + with tf.control_dependencies(control_inputs=(loss_after,)): + # Replace "/" with "_" to ensure TensorDict is flat + _deltas = TensorDict(( + (var.name[:-2].replace('/', '_'), delta) + for var, delta in zip(variables, deltas) + )) + + # TODO: should be moved to initialize_given_variables, but fn_loss... + def evaluate_step(arguments, deltas): + assignments = list() + for variable, delta in zip(variables, deltas.values()): + assignments.append(variable.assign_add(delta=delta, read_value=False)) + with tf.control_dependencies(control_inputs=assignments): + return fn_loss(**arguments.to_kwargs()) + + _deltas = self.line_search.solve( + arguments=arguments, x_init=_deltas, base_value=loss_before, + zero_value=loss_after, fn_x=evaluate_step + ) + return tuple(_deltas.values()) + + num_nonzero = list() + for delta in deltas: + num_nonzero.append(tf.math.count_nonzero(input=delta)) + num_nonzero = tf.math.add_n(inputs=num_nonzero) + + return tf.cond(pred=(num_nonzero == 0), true_fn=(lambda: deltas), false_fn=linesearch) diff --git a/tensorforce/core/optimizers/meta_optimizer.py b/tensorforce/core/optimizers/meta_optimizer.py deleted file mode 100755 index 2229ec350..000000000 --- a/tensorforce/core/optimizers/meta_optimizer.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.core.optimizers import Optimizer - - -class MetaOptimizer(Optimizer): - """ - A meta optimizer takes the optimization implemented by another optimizer and - modifies/optimizes its proposed result. 
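A hedged usage sketch for the new `linesearch_step` modifier (the natural-gradient inner optimizer and the numeric values are illustrative; `backtracking_factor=0.75` matches the documented default):

```python
# Hypothetical spec: refine the step proposed by a natural-gradient optimizer with
# a backtracking line search of at most 10 iterations.
optimizer = dict(
    type='linesearch_step',
    optimizer=dict(type='natural_gradient', learning_rate=5e-3),
    max_iterations=10,
    backtracking_factor=0.75
)
```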
For example, line search might be applied to find a - more optimal step size. - """ - - def __init__(self, optimizer, scope='meta-optimizer', summary_labels=(), **kwargs): - """ - Creates a new meta optimizer instance. - - Args: - optimizer: The optimizer which is modified by this meta optimizer. - """ - self.optimizer = Optimizer.from_spec(spec=optimizer, kwargs=kwargs) - - super(MetaOptimizer, self).__init__(scope=scope, summary_labels=summary_labels) - - def get_variables(self): - return super(MetaOptimizer, self).get_variables() + self.optimizer.get_variables() diff --git a/tensorforce/core/optimizers/multi_step.py b/tensorforce/core/optimizers/multi_step.py index dea38fd49..bb579921b 100755 --- a/tensorforce/core/optimizers/multi_step.py +++ b/tensorforce/core/optimizers/multi_step.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,79 +13,50 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from six.moves import xrange import tensorflow as tf -from tensorforce.core.optimizers import MetaOptimizer +from tensorforce.core import parameter_modules, tf_function, tf_util +from tensorforce.core.optimizers import UpdateModifier -class MultiStep(MetaOptimizer): +class MultiStep(UpdateModifier): """ - The multi-step meta optimizer repeatedly applies the optimization step proposed by another - optimizer a number of times. + Multi-step update modifier, which applies the given optimizer for a number of times + (specification key: `multi_step`). + + Args: + optimizer (specification): Optimizer configuration + (required). + num_steps (parameter, int >= 1): Number of optimization steps + (required). + name (string): (internal use). + arguments_spec (specification): internal use. """ - def __init__(self, optimizer, num_steps=10, unroll_loop=False, scope='multi-step', summary_labels=()): - """ - Creates a new multi-step meta optimizer instance. - - Args: - optimizer: The optimizer which is modified by this meta optimizer. - num_steps: Number of optimization steps to perform. - """ - assert isinstance(num_steps, int) and num_steps > 0 - self.num_steps = num_steps - - assert isinstance(unroll_loop, bool) - self.unroll_loop = unroll_loop - - super(MultiStep, self).__init__(optimizer=optimizer, scope=scope, summary_labels=summary_labels) - - def tf_step(self, time, variables, arguments, fn_reference=None, **kwargs): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - arguments: Dict of arguments for callables, like fn_loss. - fn_reference: A callable returning the reference values, in case of a comparative loss. - **kwargs: Additional arguments passed on to the internal optimizer. + def __init__(self, *, optimizer, num_steps, name=None, arguments_spec=None): + super().__init__(optimizer=optimizer, name=name, arguments_spec=arguments_spec) - Returns: - List of delta tensors corresponding to the updates for each optimized variable. 
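As a usage sketch for the `multi_step` modifier documented above (the inner optimizer and values are editorial assumptions):

```python
# Hypothetical spec: run the wrapped Adam optimizer for 10 steps per update call.
optimizer = dict(
    type='multi_step',
    optimizer=dict(type='adam', learning_rate=1e-3),
    num_steps=10
)
```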
- """ + self.num_steps = self.submodule( + name='num_steps', module=num_steps, modules=parameter_modules, dtype='int', + min_value=1 + ) - # Set reference to compare with at each optimization step, in case of a comparative loss. - arguments['reference'] = fn_reference(**arguments) - - # First step - deltas = self.optimizer.step(time=time, variables=variables, arguments=arguments, **kwargs) - - if self.unroll_loop: - # Unrolled for loop - for _ in xrange(self.num_steps - 1): - with tf.control_dependencies(control_inputs=deltas): - step_deltas = self.optimizer.step(time=time, variables=variables, arguments=arguments, **kwargs) - deltas = [delta1 + delta2 for delta1, delta2 in zip(deltas, step_deltas)] + @tf_function(num_args=1) + def step(self, *, arguments, variables, **kwargs): + deltas = [tf.zeros_like(input=variable) for variable in variables] + def body(*deltas): + with tf.control_dependencies(control_inputs=deltas): + step_deltas = self.optimizer.step( + arguments=arguments, variables=variables, **kwargs + ) + deltas = [delta1 + delta2 for delta1, delta2 in zip(deltas, step_deltas)] return deltas - else: - # TensorFlow while loop - def body(iteration, deltas): - with tf.control_dependencies(control_inputs=deltas): - step_deltas = self.optimizer.step(time=time, variables=variables, arguments=arguments, **kwargs) - deltas = [delta1 + delta2 for delta1, delta2 in zip(deltas, step_deltas)] - return iteration + 1, deltas - - def cond(iteration, deltas): - return iteration < self.num_steps - 1 + num_steps = self.num_steps.value() + deltas = tf.while_loop( + cond=tf_util.always_true, body=body, loop_vars=deltas, + maximum_iterations=tf_util.int32(x=num_steps) + ) - _, deltas = tf.while_loop(cond=cond, body=body, loop_vars=(0, deltas)) - - return deltas + return deltas diff --git a/tensorforce/core/optimizers/natural_gradient.py b/tensorforce/core/optimizers/natural_gradient.py index e31ba3606..675365145 100755 --- a/tensorforce/core/optimizers/natural_gradient.py +++ b/tensorforce/core/optimizers/natural_gradient.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,155 +13,186 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division +import functools import tensorflow as tf +from tensorforce import util +from tensorforce.core import parameter_modules, TensorDict, tf_function, tf_util from tensorforce.core.optimizers import Optimizer -from tensorforce.core.optimizers.solvers import ConjugateGradient +from tensorforce.core.optimizers.solvers import solver_modules class NaturalGradient(Optimizer): """ - Natural gradient optimizer. + Natural gradient optimizer (specification key: `natural_gradient`). + + Args: + learning_rate (parameter, float > 0.0): Learning rate as KL-divergence of distributions + between optimization steps + (required). + cg_max_iterations (int >= 1): Maximum number of conjugate gradient iterations. + (default: 10). + cg_damping (0.0 <= float <= 1.0): Conjugate gradient damping factor. + (default: 0.1). + only_positive_updates (bool): Whether to only perform updates with positive improvement + estimate + (default: true). + name (string): (internal use). 
+ arguments_spec (specification): internal use. """ def __init__( - self, - learning_rate, - cg_max_iterations=20, - cg_damping=1e-3, - cg_unroll_loop=False, - scope='natural-gradient', - summary_labels=() + self, *, learning_rate, cg_max_iterations=10, cg_damping=0.1, only_positive_updates=True, + name=None, arguments_spec=None ): - """ - Creates a new natural gradient optimizer instance. - - Args: - learning_rate: Learning rate, i.e. KL-divergence of distributions between optimization steps. - cg_max_iterations: Conjugate gradient solver max iterations. - cg_damping: Conjugate gradient solver damping factor. - cg_unroll_loop: Unroll conjugate gradient loop if true. - """ - assert learning_rate > 0.0 - self.learning_rate = learning_rate - - self.solver = ConjugateGradient( - max_iterations=cg_max_iterations, - damping=cg_damping, - unroll_loop=cg_unroll_loop + super().__init__(name=name, arguments_spec=arguments_spec) + + self.learning_rate = self.submodule( + name='learning_rate', module=learning_rate, modules=parameter_modules, dtype='float', + min_value=0.0 ) - super(NaturalGradient, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_step( - self, - time, - variables, - arguments, - fn_loss, - fn_kl_divergence, - return_estimated_improvement=False, - **kwargs - ): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - arguments: Dict of arguments for callables, like fn_loss. - fn_loss: A callable returning the loss of the current model. - fn_kl_divergence: A callable returning the KL-divergence relative to the current model. - return_estimated_improvement: Returns the estimated improvement resulting from the - natural gradient calculation if true. - **kwargs: Additional arguments, not used. - - Returns: - List of delta tensors corresponding to the updates for each optimized variable. - """ + self.conjugate_gradient = self.submodule( + name='conjugate_gradient', module='conjugate_gradient', modules=solver_modules, + max_iterations=cg_max_iterations, damping=cg_damping + ) + self.only_positive_updates = only_positive_updates + + def initialize_given_variables(self, *, variables): + super().initialize_given_variables(variables=variables) + + self.conjugate_gradient.complete_initialize( + arguments_spec=self.arguments_spec, values_spec=self.variables_spec + ) + + @tf_function(num_args=1) + def step(self, *, arguments, variables, fn_loss, **kwargs): # Optimize: argmin(w) loss(w + delta) such that kldiv(P(w) || P(w + delta)) = learning_rate # For more details, see our blogpost: # https://reinforce.io/blog/end-to-end-computation-graphs-for-reinforcement-learning/ - # from tensorforce import util - # arguments = util.map_tensors(fn=tf.stop_gradient, tensors=arguments) - - # kldiv - kldiv = fn_kl_divergence(**arguments) - - # grad(kldiv) - kldiv_gradients = tf.gradients(ys=kldiv, xs=variables) + fn_kl_divergence = kwargs['fn_kl_divergence'] + # TODO: should be moved to initialize_given_variables, but fn_kl_divergence... # Calculates the product x * F of a given vector x with the fisher matrix F. # Incorporating the product prevents having to calculate the entire matrix explicitly. - def fisher_matrix_product(deltas): - # Gradient is not propagated through solver. 
- deltas = [tf.stop_gradient(input=delta) for delta in deltas] - - # delta' * grad(kldiv) - delta_kldiv_gradients = tf.add_n(inputs=[ - tf.reduce_sum(input_tensor=(delta * grad)) for delta, grad in zip(deltas, kldiv_gradients) - ]) + def fisher_matrix_product(arguments, deltas): + # Second-order gradients + with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape1: + for variable in variables: + tape1.watch(tensor=variable) + with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape2: + for variable in variables: + tape2.watch(tensor=variable) + + # kldiv + kldiv = fn_kl_divergence(**arguments.to_kwargs()) + + # grad(kldiv) + kldiv_grads = tape2.gradient(target=kldiv, sources=variables) + kldiv_grads = [ + tf.zeros_like(input=var) if grad is None else grad + for var, grad in zip(variables, kldiv_grads) + ] + + # delta' * grad(kldiv) + multiply = functools.partial( + tf_util.lift_indexedslices, tf.math.multiply, + with_assertions=self.config.create_tf_assertions + ) + delta_kldiv_grads = tf.math.add_n(inputs=[ + tf.math.reduce_sum(input_tensor=multiply(delta, grad)) + for delta, grad in zip(deltas.values(), kldiv_grads) + ]) # [delta' * F] = grad(delta' * grad(kldiv)) - return tf.gradients(ys=delta_kldiv_gradients, xs=variables) + delta_kldiv_grads2 = tape1.gradient(target=delta_kldiv_grads, sources=variables) + return TensorDict(( + (var.name[:-2].replace('/', '_'), tf.zeros_like(input=var) if x is None else x) + for var, x in zip(variables, delta_kldiv_grads2) + )) # loss - loss = fn_loss(**arguments) + with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape: + for variable in variables: + tape.watch(tensor=variable) + loss = fn_loss(**arguments.to_kwargs()) # grad(loss) - loss_gradients = tf.gradients(ys=loss, xs=variables) + loss_gradients = tape.gradient(target=loss, sources=variables) + loss_gradients = [ + tf.zeros_like(input=var) if grad is None else grad + for var, grad in zip(variables, loss_gradients) + ] # Solve the following system for delta' via the conjugate gradient solver. # [delta' * F] * delta' = -grad(loss) # --> delta' (= lambda * delta) - deltas = self.solver.solve(fn_x=fisher_matrix_product, x_init=None, b=[-grad for grad in loss_gradients]) + # Replace "/" with "_" to ensure TensorDict is flat + x_init = TensorDict( + ((var.name[:-2].replace('/', '_'), tf.zeros_like(input=var)) for var in variables) + ) + b = TensorDict( + ((var.name[:-2].replace('/', '_'), -x) for var, x in zip(variables, loss_gradients)) + ) + deltas = self.conjugate_gradient.solve( + arguments=arguments, x_init=x_init, b=b, fn_x=fisher_matrix_product + ) # delta' * F - delta_fisher_matrix_product = fisher_matrix_product(deltas=deltas) + delta_fisher_matrix_product = fisher_matrix_product(arguments=arguments, deltas=deltas) - # c' = 0.5 * delta' * F * delta' (= lambda * c) + # c' = 0.5 * delta' * F * delta' (= lambda^2 * c) # TODO: Why constant and hence KL-divergence sometimes negative? 
- constant = 0.5 * tf.add_n(inputs=[ - tf.reduce_sum(input_tensor=(delta_F * delta)) - for delta_F, delta in zip(delta_fisher_matrix_product, deltas) + delta_F_delta = delta_fisher_matrix_product.fmap( + function=(lambda delta_F, delta: delta_F * delta), zip_values=deltas + ) + half = tf_util.constant(value=0.5, dtype='float') + constant = half * tf.math.add_n(inputs=[ + tf.math.reduce_sum(input_tensor=x) for x in delta_F_delta.values() ]) + learning_rate = self.learning_rate.value() + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + + # Zero step if constant <= 0 + def no_step(): + return [tf.zeros_like(input=delta) for delta in deltas.values()] + # Natural gradient step if constant > 0 - def natural_gradient_step(): + def apply_step(): # lambda = sqrt(c' / c) - lagrange_multiplier = tf.sqrt(x=(constant / self.learning_rate)) + lagrange_multiplier = tf.where( + condition=(constant > 0.0), + x=tf.math.sqrt(x=constant), y=-tf.math.sqrt(x=-constant) + ) / (learning_rate + epsilon) # delta = delta' / lambda - estimated_deltas = [delta / lagrange_multiplier for delta in deltas] - - # improvement = grad(loss) * delta (= loss_new - loss_old) - estimated_improvement = tf.add_n(inputs=[ - tf.reduce_sum(input_tensor=(grad * delta)) - for grad, delta in zip(loss_gradients, estimated_deltas) - ]) + lagrange_multiplier = tf.where( + condition=(lagrange_multiplier > 0.0), + x=(lagrange_multiplier + epsilon), y=(lagrange_multiplier - epsilon) + ) + estimated_deltas = deltas.fmap(function=(lambda delta: delta / lagrange_multiplier)) # Apply natural gradient improvement. - applied = self.apply_step(variables=variables, deltas=estimated_deltas) - - with tf.control_dependencies(control_inputs=(applied,)): - # Trivial operation to enforce control dependency - if return_estimated_improvement: - return [estimated_delta + 0.0 for estimated_delta in estimated_deltas], estimated_improvement - else: - return [estimated_delta + 0.0 for estimated_delta in estimated_deltas] - - # Zero step if constant <= 0 - def zero_step(): - if return_estimated_improvement: - return [tf.zeros_like(tensor=delta) for delta in deltas], 0.0 - else: - return [tf.zeros_like(tensor=delta) for delta in deltas] - - # Natural gradient step only works if constant > 0 - return tf.cond(pred=(constant > 0.0), true_fn=natural_gradient_step, false_fn=zero_step) + assignments = list() + for variable, delta in zip(variables, estimated_deltas.values()): + assignments.append(variable.assign_add(delta=delta, read_value=False)) + + with tf.control_dependencies(control_inputs=assignments): + # # improvement = grad(loss) * delta (= loss_new - loss_old) + # improvement_estimate = tf.math.add_n(inputs=[ + # tf.math.reduce_sum(input_tensor=(loss_grad * delta)) + # for loss_grad, delta in zip(loss_gradients, estimated_deltas.values()) + # ]) + return [tf_util.identity(input=delta) for delta in estimated_deltas.values()] + + if self.only_positive_updates: + # Natural gradient step only works if constant > 0 (epsilon to avoid zero division) + skip_step = constant < (epsilon * learning_rate) + return tf.cond(pred=skip_step, true_fn=no_step, false_fn=apply_step) + + else: + return apply_step() diff --git a/tensorforce/core/optimizers/optimized_step.py b/tensorforce/core/optimizers/optimized_step.py deleted file mode 100755 index be707ced0..000000000 --- a/tensorforce/core/optimizers/optimized_step.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. 
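To unpack the comments `c' = 0.5 * delta' * F * delta'` and `lambda = sqrt(c' / c)` retained above, here is a small NumPy illustration of the rescaling on a toy two-parameter problem (an editorial sketch of the underlying math only; the graph code uses the conjugate-gradient solver instead of a direct solve and adds sign handling and epsilon guards):

```python
import numpy as np

learning_rate = 1e-2                    # c: the KL-divergence budget per update
F = np.array([[2.0, 0.3], [0.3, 1.0]])  # stand-in Fisher matrix (assumed positive definite)
loss_grad = np.array([0.5, -1.0])       # stand-in loss gradient

delta_prime = np.linalg.solve(F, -loss_grad)   # delta' solves F * delta' = -grad(loss)
c_prime = 0.5 * delta_prime @ F @ delta_prime  # c' = 0.5 * delta'^T F delta'
lagrange = np.sqrt(c_prime / learning_rate)    # lambda = sqrt(c' / c)
delta = delta_prime / lagrange                 # rescaled natural-gradient step

# The rescaled step exhausts the KL budget: 0.5 * delta^T F delta == c
assert np.isclose(0.5 * delta @ F @ delta, learning_rate)
```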
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce import TensorForceError -from tensorforce.core.optimizers import MetaOptimizer -from tensorforce.core.optimizers.solvers import LineSearch - - -class OptimizedStep(MetaOptimizer): - """ - The optimized-step meta optimizer applies line search to the proposed optimization step of - another optimizer to find a more optimal step size. - """ - - def __init__( - self, - optimizer, - ls_max_iterations=10, - ls_accept_ratio=0.9, - ls_mode='exponential', - ls_parameter=0.5, - ls_unroll_loop=False, - scope='optimized-step', - summary_labels=() - ): - """ - Creates a new optimized step meta optimizer instance. - - Args: - optimizer: The optimizer which is modified by this meta optimizer. - ls_max_iterations: Maximum number of line search iterations. - ls_accept_ratio: Line search acceptance ratio. - ls_mode: Line search mode, see LineSearch solver. - ls_parameter: Line search parameter, see LineSearch solver. - ls_unroll_loop: Unroll line search loop if true. - """ - self.solver = LineSearch( - max_iterations=ls_max_iterations, - accept_ratio=ls_accept_ratio, - mode=ls_mode, - parameter=ls_parameter, - unroll_loop=ls_unroll_loop - ) - - super(OptimizedStep, self).__init__(optimizer=optimizer, scope=scope, summary_labels=summary_labels) - - def tf_step( - self, - time, - variables, - arguments, - fn_loss, - fn_reference, - **kwargs - ): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - arguments: Dict of arguments for callables, like fn_loss. - fn_loss: A callable returning the loss of the current model. - fn_reference: A callable returning the reference values, in case of a comparative loss. - **kwargs: Additional arguments passed on to the internal optimizer. - - Returns: - List of delta tensors corresponding to the updates for each optimized variable. - """ - - # Set reference to compare with at each optimization step, in case of a comparative loss. - arguments['reference'] = fn_reference(**arguments) - - # Negative value since line search maximizes. - loss_before = -fn_loss(**arguments) - - with tf.control_dependencies(control_inputs=(loss_before,)): - deltas = self.optimizer.step( - time=time, - variables=variables, - arguments=arguments, - fn_loss=fn_loss, - return_estimated_improvement=True, - **kwargs - ) - - if isinstance(deltas, tuple): - # If 'return_estimated_improvement' argument exists. - if len(deltas) != 2: - raise TensorForceError("Unexpected output of internal optimizer.") - deltas, estimated_improvement = deltas - # Negative value since line search maximizes. 
- estimated_improvement = -estimated_improvement - else: - estimated_improvement = None - - with tf.control_dependencies(control_inputs=deltas): - # Negative value since line search maximizes. - loss_step = -fn_loss(**arguments) - - with tf.control_dependencies(control_inputs=(loss_step,)): - - def evaluate_step(deltas): - with tf.control_dependencies(control_inputs=deltas): - applied = self.apply_step(variables=variables, deltas=deltas) - with tf.control_dependencies(control_inputs=(applied,)): - # Negative value since line search maximizes. - return -fn_loss(**arguments) - - return self.solver.solve( - fn_x=evaluate_step, - x_init=deltas, - base_value=loss_before, - target_value=loss_step, - estimated_improvement=estimated_improvement - ) diff --git a/tensorforce/core/optimizers/optimizer.py b/tensorforce/core/optimizers/optimizer.py index 950b3a996..e76e0fa5d 100755 --- a/tensorforce/core/optimizers/optimizer.py +++ b/tensorforce/core/optimizers/optimizer.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,163 +13,162 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import tensorflow as tf -from tensorforce import util, TensorForceError -import tensorforce.core.optimizers +from tensorforce.core import Module, SignatureDict, TensorSpec, TensorsSpec, tf_function, tf_util -class Optimizer(object): +class Optimizer(Module): """ - Base class for optimizers which minimize a not yet further specified expression, usually some - kind of loss function. More generally, an optimizer can be considered as some method of - updating a set of variables. + Base class for optimizers. + + Args: + name (string): (internal use). + arguments_spec (specification): internal use. """ - def __init__(self, scope='optimizer', summary_labels=None): - """ - Creates a new optimizer instance. - """ - self.summary_labels = set(summary_labels or ()) - - self.variables = dict() - self.summaries = list() - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - assert kwargs.get('trainable', False) - self.variables[name] = variable - return variable - - # TensorFlow function - self.step = tf.make_template( - name_=(scope + '/step'), - func_=self.tf_step, - custom_getter=custom_getter - ) - - def tf_step(self, time, variables, **kwargs): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - **kwargs: Additional arguments depending on the specific optimizer implementation. - For instance, often includes `fn_loss` if a loss function is optimized. - - Returns: - List of delta tensors corresponding to the updates for each optimized variable. 
- """ + def __init__(self, *, name=None, arguments_spec=None): + super().__init__(name=name) + + self.arguments_spec = arguments_spec + + self.is_initialized_given_variables = False + + def initialize_given_variables(self, *, variables): + assert not self.root.is_initialized and not self.is_initialized_given_variables + + for module in self.this_submodules: + if isinstance(module, Optimizer): + module.initialize_given_variables(variables=variables) + + # Replace "/" with "_" to ensure TensorDict is flat + self.variables_spec = TensorsSpec(((var.name[:-2].replace('/', '_'), TensorSpec( + type=tf_util.dtype(x=var, fallback_tf_dtype=True), shape=tf_util.shape(x=var) + )) for var in variables)) + + self.is_initialized_given_variables = True + + if self.config.create_debug_assertions: + self.is_initialized = False + for variable in variables: + self.zero_check_history = self.variable( + name='zero_check_history', + spec=TensorSpec(type='bool', shape=(3, len(variables))), + initializer='zeros', is_trainable=False, is_saved=False + ) + self.zero_check_index = self.variable( + name='zero_check_index', spec=TensorSpec(type='int', shape=()), + initializer='zeros', is_trainable=False, is_saved=False + ) + self.is_initialized = True + + def input_signature(self, *, function): + if function == 'step' or function == 'update': + return SignatureDict(arguments=self.arguments_spec.signature(batched=True)) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'step': + return self.variables_spec.fmap( + function=(lambda spec: spec.signature(batched=False)), cls=SignatureDict + ) + + elif function == 'update': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=1) + def step(self, *, arguments, variables, **kwargs): raise NotImplementedError - def apply_step(self, variables, deltas): - """ - Applies step deltas to variable values. - - Args: - variables: List of variables. - deltas: List of deltas of same length. - - Returns: - The step-applied operation. - """ - if len(variables) != len(deltas): - raise TensorForceError("Invalid variables and deltas lists.") - return tf.group( - *(tf.assign_add(ref=variable, value=delta) for variable, delta in zip(variables, deltas)) - ) - - def minimize(self, time, variables, **kwargs): - """ - Performs an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - **kwargs: Additional optimizer-specific arguments. The following arguments are used - by some optimizers: - - arguments: Dict of arguments for callables, like fn_loss. - - fn_loss: A callable returning the loss of the current model. - - fn_reference: A callable returning the reference values, in case of a comparative - loss. - - fn_kl_divergence: A callable returning the KL-divergence relative to the - current model. - - return_estimated_improvement: Returns the estimated improvement resulting from - the natural gradient calculation if true. - - source_variables: List of source variables to synchronize with. - - global_variables: List of global variables to apply the proposed optimization - step to. - - - Returns: - The optimization operation. 
- """ - # # Add training variable gradient histograms/scalars to summary output - # # if 'gradients' in self.summary_labels: - # if any(k in self.summary_labels for k in ['gradients', 'gradients_histogram', 'gradients_scalar']): - # valid = True - # if isinstance(self, tensorforce.core.optimizers.TFOptimizer): - # gradients = self.optimizer.compute_gradients(kwargs['fn_loss']()) - # elif isinstance(self.optimizer, tensorforce.core.optimizers.TFOptimizer): - # # This section handles "Multi_step" and may handle others - # # if failure is found, add another elif to handle that case - # gradients = self.optimizer.optimizer.compute_gradients(kwargs['fn_loss']()) - # else: - # # Didn't find proper gradient information - # valid = False - - # # Valid gradient data found, create summary data items - # if valid: - # for grad, var in gradients: - # if grad is not None: - # if any(k in self.summary_labels for k in ('gradients', 'gradients_scalar')): - # axes = list(range(len(grad.shape))) - # mean, var = tf.nn.moments(grad, axes) - # summary = tf.summary.scalar(name='gradients/' + var.name + "/mean", tensor=mean) - # self.summaries.append(summary) - # summary = tf.summary.scalar(name='gradients/' + var.name + "/variance", tensor=var) - # self.summaries.append(summary) - # if any(k in self.summary_labels for k in ('gradients', 'gradients_histogram')): - # summary = tf.summary.histogram(name='gradients/' + var.name, values=grad) - # self.summaries.append(summary) - - deltas = self.step(time=time, variables=variables, **kwargs) - with tf.control_dependencies(control_inputs=deltas): - return tf.no_op() - - def get_variables(self): - """ - Returns the TensorFlow variables used by the optimizer. - - Returns: - List of variables. - """ - return [self.variables[key] for key in sorted(self.variables)] - - def get_summaries(self): - """ - Returns the TensorFlow summaries reported by the optimizer. - - Returns: - List of summaries. - """ - return self.summaries - - @staticmethod - def from_spec(spec, kwargs=None): - """ - Creates an optimizer from a specification dict. 
- """ - optimizer = util.get_object( - obj=spec, - predefined_objects=tensorforce.core.optimizers.optimizers, - kwargs=kwargs - ) - assert isinstance(optimizer, Optimizer) - return optimizer + @tf_function(num_args=1) + def update(self, *, arguments, variables, **kwargs): + assert self.is_initialized_given_variables + assert all(variable.dtype.is_floating for variable in variables) + + deltas = self.step(arguments=arguments, variables=variables, **kwargs) + + operations = list(deltas) + if self.config.create_debug_assertions: + from tensorforce.core.optimizers import DoublecheckStep, NaturalGradient, \ + Synchronization, UpdateModifier + optimizer = self + while isinstance(optimizer, UpdateModifier): + if isinstance(optimizer, DoublecheckStep): + break + optimizer = optimizer.optimizer + if not isinstance(optimizer, DoublecheckStep) and ( + not isinstance(optimizer, NaturalGradient) or not optimizer.only_positive_updates + ) and (not isinstance(self, Synchronization) or self.sync_frequency is None): + false = tf_util.constant(value=False, dtype='bool') + zero = tf_util.constant(value=0, dtype='int') + one = tf_util.constant(value=1, dtype='int') + zero_float = tf_util.constant(value=0.0, dtype='float') + y = tf.reduce_any(input_tensor=tf.math.not_equal( + x=arguments['reward'], y=zero_float + )) + for index, (delta, variable) in enumerate(zip(deltas, variables)): + if '_distribution/mean/linear/' in variable.name: + # Gaussian.state_value does not use mean + continue + is_zero = tf.math.logical_and( + x=tf.math.equal(x=tf.math.count_nonzero( + input=delta, dtype=tf_util.get_dtype(type='int') + ), y=zero), + y=y + ) + index = tf_util.constant(value=index, dtype='int', shape=(1,)) + index = tf.stack(values=( + tf.expand_dims(input=self.zero_check_index, axis=0), index + ), axis=1) + operations.append(tf.tensor_scatter_nd_update( + tensor=self.zero_check_history, indices=index, + updates=tf.expand_dims(input=is_zero, axis=0) + )) + + operations.append(tf.debugging.assert_equal( + x=tf.math.reduce_any(input_tensor=tf.math.reduce_all( + input_tensor=self.zero_check_history, axis=1 + ), axis=0), y=false + )) + with tf.control_dependencies(control_inputs=operations): + operations = [self.zero_check_index.assign(value=tf.math.mod(x=one, y=3))] + + with tf.control_dependencies(control_inputs=operations): + dependencies = list() + + if self.root.summaries == 'all' or 'update-norm' in self.root.summaries: + with self.root.summarizer.as_default(): + x = tf.linalg.global_norm( + t_list=[tf_util.cast(x=delta, dtype='float') for delta in deltas] + ) + dependencies.append( + tf.summary.scalar(name='update-norm', data=x, step=self.root.updates) + ) + + if self.root.summaries == 'all' or 'updates' in self.root.summaries: + with self.root.summarizer.as_default(): + for var in variables: + assert var.name[-2] == ':' + if var.name.startswith(self.root.name + '/'): + mean_name = var.name[len(self.root.name) + 1: -2] + '-mean' + var_name = var.name[len(self.root.name) + 1: -2] + '-variance' + else: + mean_name = var.name[:-2] + '-mean' + var_name = var.name[:-2] + '-variance' + mean, variance = tf.nn.moments(x=var, axes=list(range(tf_util.rank(x=var)))) + dependencies.append( + tf.summary.scalar(name=mean_name, data=mean, step=self.root.updates) + ) + dependencies.append( + tf.summary.scalar(name=var_name, data=variance, step=self.root.updates) + ) + + with tf.control_dependencies(control_inputs=dependencies): + return tf_util.identity(input=tf_util.constant(value=True, dtype='bool')) diff --git 
a/tensorforce/core/optimizers/optimizer_wrapper.py b/tensorforce/core/optimizers/optimizer_wrapper.py new file mode 100644 index 000000000..85edd36f3 --- /dev/null +++ b/tensorforce/core/optimizers/optimizer_wrapper.py @@ -0,0 +1,95 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce import TensorforceError +from tensorforce.core import tf_function +from tensorforce.core.optimizers import UpdateModifier + + +class OptimizerWrapper(UpdateModifier): + """ + Optimizer wrapper, which performs additional update modifications, argument order indicates + modifier nesting from outside to inside + (specification key: `optimizer_wrapper`). + + Args: + optimizer (specification): Optimizer + (required). + learning_rate (parameter, float > 0.0): Learning rate + (default: 1e-3). + clipping_threshold (parameter, float > 0.0): Clipping threshold + (default: no clipping). + multi_step (parameter, int >= 1): Number of optimization steps + (default: single step). + subsampling_fraction (parameter, int > 0 | 0.0 < float <= 1.0): Absolute/relative fraction + of batch timesteps to subsample, update_frequency * multi_step should be at least 1 if + relative subsampling_fraction + (default: no subsampling). + linesearch_iterations (parameter, int >= 0): Maximum number of line search iterations, using + a backtracking factor of 0.75 + (default: no line search). + doublecheck_update (bool): Check whether update has decreased loss and otherwise reverse it + name (string): (internal use). + arguments_spec (specification): internal use. 
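The wrapper's arguments expand into a nested chain of the update modifiers introduced earlier in this diff, with argument order indicating nesting from outside to inside. An editorial sketch with assumed values (given the `default=OptimizerWrapper` registration in the package `__init__`, a flat specification like the first dict would presumably be routed through this wrapper):

```python
# Hypothetical flat specification handled by OptimizerWrapper ...
optimizer = dict(
    optimizer='adam', learning_rate=1e-3, clipping_threshold=1.0, multi_step=5,
    subsampling_fraction=0.5, linesearch_iterations=5, doublecheck_update=False
)

# ... which the constructor below unrolls into roughly this nesting (outermost first):
optimizer = dict(
    type='clipping_step', threshold=1.0,
    optimizer=dict(
        type='multi_step', num_steps=5,
        optimizer=dict(
            type='subsampling_step', fraction=0.5,
            optimizer=dict(
                type='linesearch_step', max_iterations=5,
                optimizer=dict(type='adam', learning_rate=1e-3)
            )
        )
    )
)
```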
+ """ + + def __init__( + self, optimizer, *, learning_rate=1e-3, clipping_threshold=None, multi_step=1, + subsampling_fraction=1.0, linesearch_iterations=0, doublecheck_update=False, name=None, + arguments_spec=None, + # Deprecated + optimizing_iterations=None, **kwargs + ): + if optimizing_iterations is not None: + raise TensorforceError.deprecated( + name='Optimizer', argument='optimizing_iterations', + replacement='linesearch_iterations' + ) + + if isinstance(optimizer, dict): + if 'learning_rate' not in optimizer: + optimizer['learning_rate'] = learning_rate + else: + optimizer = dict(type=optimizer) + optimizer['learning_rate'] = learning_rate + + optimizer.update(kwargs) + + if doublecheck_update: + optimizer = dict(type='doublecheck_step', optimizer=optimizer) + + if not isinstance(linesearch_iterations, int) or linesearch_iterations > 0: + optimizer = dict( + type='linesearch_step', optimizer=optimizer, max_iterations=linesearch_iterations + ) + + if not isinstance(subsampling_fraction, float) or subsampling_fraction != 1.0: + optimizer = dict( + type='subsampling_step', optimizer=optimizer, fraction=subsampling_fraction + ) + + if not isinstance(multi_step, int) or multi_step > 1: + optimizer = dict(type='multi_step', optimizer=optimizer, num_steps=multi_step) + + if clipping_threshold is not None: + optimizer = dict( + type='clipping_step', optimizer=optimizer, threshold=clipping_threshold + ) + + super().__init__(optimizer=optimizer, name=name, arguments_spec=arguments_spec) + + @tf_function(num_args=1) + def step(self, *, arguments, variables, **kwargs): + return self.optimizer.step(arguments=arguments, variables=variables, **kwargs) diff --git a/tensorforce/core/optimizers/plus.py b/tensorforce/core/optimizers/plus.py new file mode 100644 index 000000000..33c5983b4 --- /dev/null +++ b/tensorforce/core/optimizers/plus.py @@ -0,0 +1,56 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +import tensorforce.core +from tensorforce.core import tf_function +from tensorforce.core.optimizers import Optimizer + + +class Plus(Optimizer): + """ + Additive combination of two optimizers (specification key: `plus`). + + Args: + optimizer1 (specification): First optimizer configuration + (required). + optimizer2 (specification): Second optimizer configuration + (required). + name (string): (internal use). + arguments_spec (specification): internal use. 
+ """ + + def __init__(self, *, optimizer1, optimizer2, name=None, arguments_spec=None): + super().__init__(name=name, arguments_spec=arguments_spec) + + self.optimizer1 = self.submodule( + name=(name + '1'), module=optimizer1, modules=tensorforce.core.optimizer_modules, + arguments_spec=self.arguments_spec + ) + self.optimizer2 = self.submodule( + name=(name + '2'), module=optimizer2, modules=tensorforce.core.optimizer_modules, + arguments_spec=self.arguments_spec + ) + + @tf_function(num_args=1) + def step(self, *, arguments, **kwargs): + deltas1 = self.optimizer1.step(arguments=arguments, **kwargs) + + with tf.control_dependencies(control_inputs=deltas1): + deltas2 = self.optimizer2.step(arguments=arguments, **kwargs) + + with tf.control_dependencies(control_inputs=deltas2): + return [delta1 + delta2 for delta1, delta2 in zip(deltas1, deltas2)] diff --git a/tensorforce/core/optimizers/solvers/__init__.py b/tensorforce/core/optimizers/solvers/__init__.py index f97142eab..41606b69c 100644 --- a/tensorforce/core/optimizers/solvers/__init__.py +++ b/tensorforce/core/optimizers/solvers/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,14 @@ # limitations under the License. # ============================================================================== - from tensorforce.core.optimizers.solvers.solver import Solver from tensorforce.core.optimizers.solvers.iterative import Iterative + from tensorforce.core.optimizers.solvers.conjugate_gradient import ConjugateGradient from tensorforce.core.optimizers.solvers.line_search import LineSearch -solvers = dict( - conjugate_gradient=ConjugateGradient, - line_search=LineSearch -) +solver_modules = dict(conjugate_gradient=ConjugateGradient, line_search=LineSearch) -__all__ = ['solvers', 'Solver', 'Iterative', 'ConjugateGradient', 'LineSearch'] +__all__ = ['ConjugateGradient', 'Iterative', 'LineSearch', 'Solver', 'solver_modules'] diff --git a/tensorforce/core/optimizers/solvers/conjugate_gradient.py b/tensorforce/core/optimizers/solvers/conjugate_gradient.py index d8c5651c5..dcfda7b60 100644 --- a/tensorforce/core/optimizers/solvers/conjugate_gradient.py +++ b/tensorforce/core/optimizers/solvers/conjugate_gradient.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,12 @@ # limitations under the License. # ============================================================================== +import functools + import tensorflow as tf from tensorforce import util +from tensorforce.core import parameter_modules, SignatureDict, TensorSpec, tf_function, tf_util from tensorforce.core.optimizers.solvers import Iterative @@ -51,68 +54,120 @@ def conjgrad(A, b, x_0): """ - def __init__(self, max_iterations, damping, unroll_loop=False): + def __init__(self, *, name, max_iterations, damping): """ Creates a new conjugate gradient solver instance. Args: - max_iterations: Maximum number of iterations before termination. - damping: Damping factor. - unroll_loop: Unrolls the TensorFlow while loop if true. + max_iterations (parameter, int >= 1): Maximum number of iterations before termination. 
+ damping (parameter, 0.0 <= float <= 1.0): Damping factor. """ - assert damping >= 0.0 - self.damping = damping + super().__init__(name=name, max_iterations=max_iterations) - super(ConjugateGradient, self).__init__(max_iterations=max_iterations, unroll_loop=unroll_loop) + self.damping = self.submodule( + name='damping', module=damping, modules=parameter_modules, dtype='float', min_value=0.0, + max_value=1.0 + ) - def tf_solve(self, fn_x, x_init, b): + def input_signature(self, *, function): + if function == 'end' or function == 'next_step' or function == 'step': + return SignatureDict( + arguments=self.arguments_spec.signature(batched=True), + x=self.values_spec.signature(batched=False), + conjugate=self.values_spec.signature(batched=False), + residual=self.values_spec.signature(batched=False), + squared_residual=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + elif function == 'solve' or function == 'start': + return SignatureDict( + arguments=self.arguments_spec.signature(batched=True), + x_init=self.values_spec.signature(batched=False), + b=self.values_spec.signature(batched=False) + ) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'end' or function == 'solve': + return SignatureDict(singleton=self.values_spec.signature(batched=False)) + + elif function == 'next_step': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'start' or function == 'step': + return SignatureDict( + arguments=self.arguments_spec.signature(batched=True), + x=self.values_spec.signature(batched=False), + conjugate=self.values_spec.signature(batched=False), + residual=self.values_spec.signature(batched=False), + squared_residual=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=3) + def solve(self, *, arguments, x_init, b, fn_x): """ Iteratively solves the system of linear equations $A x = b$. Args: - fn_x: A callable returning the left-hand side $A x$ of the system of linear equations. + arguments: ??? x_init: Initial solution guess $x_0$, zero vector if None. b: The right-hand side $b$ of the system of linear equations. + fn_x: A callable returning the left-hand side $A x$ of the system of linear equations. Returns: A solution $x$ to the problem as given by the solver. """ - return super(ConjugateGradient, self).tf_solve(fn_x, x_init, b) + return super().solve(arguments=arguments, x_init=x_init, b=b, fn_x=fn_x) - def tf_initialize(self, x_init, b): + @tf_function(num_args=3) + def start(self, *, arguments, x_init, b): """ - Initialization step preparing the arguments for the first iteration of the loop body: + Initialization step preparing the arguments for the first iteration of the loop body: $x_0, 0, p_0, r_0, r_0^2$. Args: + arguments: ??? x_init: Initial solution guess $x_0$, zero vector if None. b: The right-hand side $b$ of the system of linear equations. Returns: - Initial arguments for tf_step. + Initial arguments for step. """ - if x_init is None: - # Initial guess is zero vector if not given. 
- x_init = [tf.zeros(shape=util.shape(t)) for t in b] - - initial_args = super(ConjugateGradient, self).tf_initialize(x_init) - # r_0 := b - A * x_0 # c_0 := r_0 - conjugate = residual = [t - fx for t, fx in zip(b, self.fn_x(x_init))] + fx = self.fn_x(arguments, x_init) + subtract = functools.partial( + tf_util.lift_indexedslices, tf.math.subtract, + with_assertions=self.config.create_tf_assertions + ) + conjugate = residual = b.fmap(function=subtract, zip_values=fx) # r_0^2 := r^T * r - squared_residual = tf.add_n(inputs=[tf.reduce_sum(input_tensor=(res * res)) for res in residual]) + multiply = functools.partial( + tf_util.lift_indexedslices, tf.math.multiply, + with_assertions=self.config.create_tf_assertions + ) + squared_residual = tf.math.add_n(inputs=[ + tf.math.reduce_sum(input_tensor=multiply(res, res)) for res in residual.values() + ]) - return initial_args + (conjugate, residual, squared_residual) + return arguments, x_init, conjugate, residual, squared_residual - def tf_step(self, x, iteration, conjugate, residual, squared_residual): + @tf_function(num_args=5, is_loop_body=True) + def step(self, *, arguments, x, conjugate, residual, squared_residual): """ Iteration loop body of the conjugate gradient algorithm. Args: + arguments: ??? x: Current solution estimate $x_t$. - iteration: Current iteration counter $t$. conjugate: Current conjugate $c_t$. residual: Current residual $r_t$. squared_residual: Current squared residual $r_t^2$. @@ -120,49 +175,72 @@ def tf_step(self, x, iteration, conjugate, residual, squared_residual): Returns: Updated arguments for next iteration. """ - x, next_iteration, conjugate, residual, squared_residual = super(ConjugateGradient, self).tf_step( - x, iteration, conjugate, residual, squared_residual - ) # Ac := A * c_t - A_conjugate = self.fn_x(conjugate) + A_conjugate = self.fn_x(arguments, conjugate) # TODO: reference? 
- if self.damping > 0.0: - A_conjugate = [A_conj + self.damping * conj for A_conj, conj in zip(A_conjugate, conjugate)] + damping = self.damping.value() + + def no_damping(): + return A_conjugate + + def apply_damping(): + return A_conjugate.fmap( + function=(lambda A_conj, conj: A_conj + damping * conj), zip_values=conjugate + ) + + zero = tf_util.constant(value=0.0, dtype='float') + skip_damping = tf.math.equal(x=damping, y=zero) + A_conjugate = tf.cond(pred=skip_damping, true_fn=no_damping, false_fn=apply_damping) # cAc := c_t^T * Ac - conjugate_A_conjugate = tf.add_n( - inputs=[tf.reduce_sum(input_tensor=(conj * A_conj)) for conj, A_conj in zip(conjugate, A_conjugate)] - ) + multiply = tf.math.multiply + conjugate_A_conjugate = conjugate.fmap(function=multiply, zip_values=A_conjugate) + conjugate_A_conjugate = tf.math.add_n(inputs=[ + tf.math.reduce_sum(input_tensor=conj_A_conj) + for conj_A_conj in conjugate_A_conjugate.values() + ]) # \alpha := r_t^2 / cAc - alpha = squared_residual / tf.maximum(x=conjugate_A_conjugate, y=util.epsilon) + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + conjugate_A_conjugate = tf.where( + condition=(conjugate_A_conjugate > 0.0), + x=(conjugate_A_conjugate + epsilon), y=(conjugate_A_conjugate - epsilon) + ) + alpha = squared_residual / conjugate_A_conjugate # x_{t+1} := x_t + \alpha * c_t - next_x = [t + alpha * conj for t, conj in zip(x, conjugate)] + next_x = x.fmap(function=(lambda t, conj: t + alpha * conj), zip_values=conjugate) # r_{t+1} := r_t - \alpha * Ac - next_residual = [res - alpha * A_conj for res, A_conj in zip(residual, A_conjugate)] + next_residual = residual.fmap( + function=(lambda res, A_conj: res - alpha * A_conj), zip_values=A_conjugate + ) # r_{t+1}^2 := r_{t+1}^T * r_{t+1} - next_squared_residual = tf.add_n(inputs=[tf.reduce_sum(input_tensor=(res * res)) for res in next_residual]) + next_squared_residual = tf.math.add_n( + inputs=[tf.math.reduce_sum(input_tensor=(res * res)) for res in next_residual.values()] + ) # \beta = r_{t+1}^2 / r_t^2 - beta = next_squared_residual / tf.maximum(x=squared_residual, y=util.epsilon) + beta = next_squared_residual / (squared_residual + epsilon) # c_{t+1} := r_{t+1} + \beta * c_t - next_conjugate = [res + beta * conj for res, conj in zip(next_residual, conjugate)] + next_conjugate = next_residual.fmap( + function=(lambda res, conj: res + beta * conj), zip_values=conjugate + ) - return next_x, next_iteration, next_conjugate, next_residual, next_squared_residual + return arguments, next_x, next_conjugate, next_residual, next_squared_residual - def tf_next_step(self, x, iteration, conjugate, residual, squared_residual): + @tf_function(num_args=5) + def next_step(self, *, arguments, x, conjugate, residual, squared_residual): """ Termination condition: max number of iterations, or residual sufficiently small. Args: + arguments: ??? x: Current solution estimate $x_t$. - iteration: Current iteration counter $t$. conjugate: Current conjugate $c_t$. residual: Current residual $r_t$. squared_residual: Current squared residual $r_t^2$. @@ -170,5 +248,6 @@ def tf_next_step(self, x, iteration, conjugate, residual, squared_residual): Returns: True if another iteration should be performed. 
""" - next_step = super(ConjugateGradient, self).tf_next_step(x, iteration, conjugate, residual, squared_residual) - return tf.logical_and(x=next_step, y=(squared_residual >= util.epsilon)) + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + + return squared_residual >= epsilon diff --git a/tensorforce/core/optimizers/solvers/iterative.py b/tensorforce/core/optimizers/solvers/iterative.py index c6d3d8bcf..73cdcece9 100644 --- a/tensorforce/core/optimizers/solvers/iterative.py +++ b/tensorforce/core/optimizers/solvers/iterative.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import tensorflow as tf -from tensorforce import util +from tensorforce.core import parameter_modules, tf_util from tensorforce.core.optimizers.solvers import Solver @@ -25,35 +25,33 @@ class Iterative(Solver): initialization step, the iteration loop body and the termination condition. """ - def __init__(self, max_iterations, unroll_loop=False): + def __init__(self, *, name, max_iterations): """ Creates a new iterative solver instance. Args: - max_iterations: Maximum number of iterations before termination. - unroll_loop: Unrolls the TensorFlow while loop if true. + max_iterations (parameter, int >= 1): Maximum number of iterations before termination. """ - assert max_iterations >= 0 - self.max_iterations = max_iterations + super().__init__(name=name) - assert isinstance(unroll_loop, bool) - self.unroll_loop = unroll_loop + self.max_iterations = self.submodule( + name='max_iterations', module=max_iterations, modules=parameter_modules, + dtype='int', min_value=1 + ) - super(Iterative, self).__init__() + def complete_initialize(self, arguments_spec, values_spec): + self.arguments_spec = arguments_spec + self.values_spec = values_spec - # TensorFlow functions - self.initialize = tf.make_template(name_='initialize', func_=self.tf_initialize) - self.step = tf.make_template(name_='step', func_=self.tf_step) - self.next_step = tf.make_template(name_='next-step', func_=self.tf_next_step) - - def tf_solve(self, fn_x, x_init, *args): + def solve(self, *, arguments, x_init, fn_x=None, **kwargs): """ Iteratively solves an equation/optimization for $x$ involving an expression $f(x)$. Args: - fn_x: A callable returning an expression $f(x)$ given $x$. + arguments: ??? x_init: Initial solution guess $x_0$. - *args: Additional solver-specific arguments. + fn_x: A callable returning an expression $f(x)$ given $x$. + **values: Additional solver-specific arguments. Returns: A solution $x$ to the problem as given by the solver. 
@@ -61,64 +59,73 @@ def tf_solve(self, fn_x, x_init, *args): self.fn_x = fn_x # Initialization step - args = self.initialize(x_init, *args) - # args = util.map_tensors(fn=tf.stop_gradient, tensors=args) + values = self.start(arguments=arguments, x_init=x_init, **kwargs) # Iteration loop with termination condition - if self.unroll_loop: - # Unrolled for loop - for _ in range(self.max_iterations): - next_step = self.next_step(*args) - step = (lambda: self.step(*args)) - do_nothing = (lambda: args) - args = tf.cond(pred=next_step, true_fn=step, false_fn=do_nothing) - - else: - # TensorFlow while loop - args = tf.while_loop(cond=self.next_step, body=self.step, loop_vars=args) - - # First argument contains solution - return args[0] - - def tf_initialize(self, x_init, *args): + max_iterations = self.max_iterations.value() + signature = self.input_signature(function='step') + values = signature.kwargs_to_args(kwargs=values) + values = tf.while_loop( + cond=self.next_step, body=self.step, loop_vars=tuple(values), + maximum_iterations=tf_util.int32(x=max_iterations) + ) + values = signature.args_to_kwargs(args=values) + solution = self.end(**values.to_kwargs()) + + return solution + + def start(self, *, arguments, x_init, **kwargs): """ - Initialization step preparing the arguments for the first iteration of the loop body - (default: initial solution guess and iteration counter). + Initialization step preparing the arguments for the first iteration of the loop body. Args: + arguments: ??? x_init: Initial solution guess $x_0$. *args: Additional solver-specific arguments. Returns: - Initial arguments for tf_step. + Initial arguments for step. """ - return x_init, 0 + return (arguments, x_init) + tuple(kwargs.values()) - def tf_step(self, x, iteration, *args): + def step(self, *, arguments, x, **kwargs): """ - Iteration loop body of the iterative solver (default: increment iteration step). The - first two loop arguments have to be the current solution estimate and the iteration step. + Iteration loop body of the iterative solver. Args: + arguments: ??? x: Current solution estimate. - iteration: Current iteration counter. *args: Additional solver-specific arguments. Returns: Updated arguments for next iteration. """ - return (x, iteration + 1) + args + raise NotImplementedError - def tf_next_step(self, x, iteration, *args): + def next_step(self, *, arguments, x, **kwargs): """ Termination condition (default: max number of iterations). Args: + arguments: ??? x: Current solution estimate. - iteration: Current iteration counter. *args: Additional solver-specific arguments. Returns: True if another iteration should be performed. """ - return iteration < self.max_iterations + return tf_util.constant(value=True, dtype='bool') + + def end(self, *, arguments, x, **kwargs): + """ + Termination step preparing the return value. + + Args: + arguments: ??? + x: Final solution estimate. + *args: Additional solver-specific arguments. + + Returns: + Final solution. + """ + return x diff --git a/tensorforce/core/optimizers/solvers/line_search.py b/tensorforce/core/optimizers/solvers/line_search.py index 8eb951ed1..9e9c7196c 100644 --- a/tensorforce/core/optimizers/solvers/line_search.py +++ b/tensorforce/core/optimizers/solvers/line_search.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
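The `Iterative` base class factors every solver into `start`, `step`, `next_step` and `end` hooks driven by a bounded `tf.while_loop`. Stripped of the signature plumbing, the control flow is roughly this plain-Python sketch (simplified, positional values instead of the signature dicts):

```python
def solve(start, step, next_step, end, max_iterations, **kwargs):
    """Generic start/step/next_step/end loop, mirroring Iterative.solve (simplified)."""
    values = start(**kwargs)
    for _ in range(max_iterations):
        if not next_step(*values):
            break
        values = step(*values)
    return end(*values)

# Toy usage: halve x until it drops below a threshold.
print(solve(
    start=lambda x_init: (x_init,), step=lambda x: (x / 2.0,),
    next_step=lambda x: x > 1.0, end=lambda x: x,
    max_iterations=10, x_init=100.0
))  # 0.78125
```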
@@ -15,163 +15,195 @@ import tensorflow as tf -from tensorforce import util, TensorForceError +from tensorforce import util +from tensorforce.core import parameter_modules, SignatureDict, TensorSpec, tf_function, tf_util from tensorforce.core.optimizers.solvers import Iterative class LineSearch(Iterative): """ - Line search algorithm which iteratively optimizes the value $f(x)$ for $x$ on the line between - $x'$ and $x_0$ by optimistically taking the first acceptable $x$ starting from $x_0$ and + Line search algorithm which iteratively optimizes the value $f(x)$ for $x$ on the line between + $x'$ and $x_0$ by optimistically taking the first acceptable $x$ starting from $x_0$ and moving towards $x'$. """ - def __init__(self, max_iterations, accept_ratio, mode, parameter, unroll_loop=False): + def __init__(self, *, name, max_iterations, backtracking_factor): """ - Creates a new line search solver instance. + Create a new line search solver instance. Args: - max_iterations: Maximum number of iterations before termination. - accept_ratio: Lower limit of what improvement ratio over $x = x'$ is acceptable - (based either on a given estimated improvement or with respect to the value at - $x = x'$). - mode: Mode of movement between $x_0$ and $x'$, either 'linear' or 'exponential'. - parameter: Movement mode parameter, additive or multiplicative, respectively. - unroll_loop: Unrolls the TensorFlow while loop if true. + max_iterations (parameter, int >= 1): Maximum number of iterations before termination. + backtracking_factor (parameter, 0.0 < float < 1.0): Backtracking factor. """ - assert accept_ratio >= 0.0 - self.accept_ratio = accept_ratio + super().__init__(name=name, max_iterations=max_iterations) - # TODO: Implement such sequences more generally, also useful for learning rate decay or so. 
- if mode not in ('linear', 'exponential'): - raise TensorForceError("Invalid line search mode: {}, please choose one of'linear' or 'exponential'".format(mode)) - self.mode = mode - self.parameter = parameter - - super(LineSearch, self).__init__(max_iterations=max_iterations, unroll_loop=unroll_loop) + self.backtracking_factor = self.submodule( + name='backtracking_factor', module=backtracking_factor, modules=parameter_modules, + dtype='float', min_value=0.0, max_value=1.0 + ) - def tf_solve(self, fn_x, x_init, base_value, target_value, estimated_improvement=None): + def input_signature(self, *, function): + if function == 'end' or function == 'next_step' or function == 'step': + return SignatureDict( + arguments=self.arguments_spec.signature(batched=True), + x=self.values_spec.signature(batched=False), + deltas=self.values_spec.signature(batched=False), + improvement=TensorSpec(type='float', shape=()).signature(batched=False), + last_improvement=TensorSpec(type='float', shape=()).signature(batched=False), + base_value=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + elif function == 'solve' or function == 'start': + return SignatureDict( + arguments=self.arguments_spec.signature(batched=True), + x_init=self.values_spec.signature(batched=False), + base_value=TensorSpec(type='float', shape=()).signature(batched=False), + zero_value=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'end' or function == 'solve': + return SignatureDict(singleton=self.values_spec.signature(batched=False)) + + elif function == 'next_step': + return SignatureDict( + singleton=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'start' or function == 'step': + return SignatureDict( + arguments=self.arguments_spec.signature(batched=True), + x=self.values_spec.signature(batched=False), + deltas=self.values_spec.signature(batched=False), + improvement=TensorSpec(type='float', shape=()).signature(batched=False), + last_improvement=TensorSpec(type='float', shape=()).signature(batched=False), + base_value=TensorSpec(type='float', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=4) + def solve(self, *, arguments, x_init, base_value, zero_value, fn_x): """ - Iteratively optimizes $f(x)$ for $x$ on the line between $x'$ and $x_0$. + Iteratively optimize $f(x)$ for $x$ on the line between $x'$ and $x_0$. Args: - fn_x: A callable returning the value $f(x)$ at $x$. x_init: Initial solution guess $x_0$. base_value: Value $f(x')$ at $x = x'$. - target_value: Value $f(x_0)$ at $x = x_0$. - estimated_improvement: Estimated improvement for $x = x_0$, $f(x')$ if None. + zero_value: Value $f(x_0)$ at $x = x_0$. + fn_x: A callable returning the value $f(x)$ at $x$, with potential side effect. Returns: A solution $x$ to the problem as given by the solver. """ - return super(LineSearch, self).tf_solve(fn_x, x_init, base_value, target_value, estimated_improvement) + return super().solve( + arguments=arguments, x_init=x_init, base_value=base_value, zero_value=zero_value, + fn_x=fn_x + ) - def tf_initialize(self, x_init, base_value, target_value, estimated_improvement): + @tf_function(num_args=4) + def start(self, *, arguments, x_init, base_value, zero_value): """ Initialization step preparing the arguments for the first iteration of the loop body. 
Args: x_init: Initial solution guess $x_0$. base_value: Value $f(x')$ at $x = x'$. - target_value: Value $f(x_0)$ at $x = x_0$. - estimated_improvement: Estimated value at $x = x_0$, $f(x')$ if None. + zero_value: Value $f(x_0)$ at $x = x_0$. Returns: - Initial arguments for tf_step. + Initial arguments for step. """ - self.base_value = base_value - - if estimated_improvement is None: # TODO: Is this a good alternative? - estimated_improvement = tf.abs(x=base_value) - - first_step = super(LineSearch, self).tf_initialize(x_init) - - improvement = tf.divide( - x=(target_value - self.base_value), - y=tf.maximum(x=estimated_improvement, y=util.epsilon) - ) + one_float = tf_util.constant(value=1.0, dtype='float') + backtracking_factor = self.backtracking_factor.value() + deltas = x_init.fmap(function=(lambda t: t * (backtracking_factor - one_float))) - last_improvement = improvement - 1.0 + last_improvement = base_value - zero_value - if self.mode == 'linear': - deltas = [-t * self.parameter for t in x_init] - self.estimated_incr = -estimated_improvement * self.parameter + target_value = self.fn_x(arguments, deltas) + improvement = base_value - target_value - elif self.mode == 'exponential': - deltas = [-t * self.parameter for t in x_init] + return arguments, x_init, deltas, improvement, last_improvement, base_value - return first_step + (deltas, improvement, last_improvement, estimated_improvement) - - def tf_step(self, x, iteration, deltas, improvement, last_improvement, estimated_improvement): + @tf_function(num_args=6, is_loop_body=True) + def step(self, *, arguments, x, deltas, improvement, last_improvement, base_value): """ Iteration loop body of the line search algorithm. Args: - x: Current solution estimate $x_t$. - iteration: Current iteration counter $t$. - deltas: Current difference $x_t - x'$. - improvement: Current improvement $(f(x_t) - f(x')) / v'$. - last_improvement: Last improvement $(f(x_{t-1}) - f(x')) / v'$. - estimated_improvement: Current estimated value $v'$. + x: Current solution estimate $x_{t-1}$. + deltas: Current difference $x_t - x_{t-1}$. + improvement: Current improvement $(f(x') - f(x_t))$. + last_improvement: Last improvement $(f(x') - f(x_{t-1}))$. + base_value: Value $f(x')$ at $x = x'$. Returns: Updated arguments for next iteration. 
""" - x, next_iteration, deltas, improvement, last_improvement, estimated_improvement = super(LineSearch, self).tf_step( - x, iteration, deltas, improvement, last_improvement, estimated_improvement - ) + next_x = x.fmap(function=(lambda t, delta: t + delta), zip_values=deltas) - next_x = [t + delta for t, delta in zip(x, deltas)] + backtracking_factor = self.backtracking_factor.value() + next_deltas = deltas.fmap(function=(lambda delta: delta * backtracking_factor)) - if self.mode == 'linear': - next_deltas = deltas - next_estimated_improvement = estimated_improvement + self.estimated_incr + target_value = self.fn_x(arguments, next_deltas) + next_improvement = base_value - target_value - elif self.mode == 'exponential': - next_deltas = [delta * self.parameter for delta in deltas] - next_estimated_improvement = estimated_improvement * self.parameter + return arguments, next_x, next_deltas, next_improvement, improvement, base_value - target_value = self.fn_x(next_deltas) + @tf_function(num_args=6) + def next_step(self, *, arguments, x, deltas, improvement, last_improvement, base_value): + """ + Termination condition: max number of iterations, or no improvement for last step, or + improvement less than acceptable ratio, or estimated value not positive. - next_improvement = tf.divide( - x=(target_value - self.base_value), - y=tf.maximum(x=next_estimated_improvement, y=util.epsilon) - ) + Args: + x: Current solution estimate $x_{t-1}$. + deltas: Current difference $x_t - x_{t-1}$. + improvement: Current improvement $(f(x') - f(x_t))$. + last_improvement: Last improvement $(f(x') - f(x_{t-1}))$. + base_value: Value $f(x')$ at $x = x'$. - return next_x, next_iteration, next_deltas, next_improvement, improvement, next_estimated_improvement + Returns: + True if another iteration should be performed. + """ + return improvement > last_improvement - def tf_next_step(self, x, iteration, deltas, improvement, last_improvement, estimated_improvement): + @tf_function(num_args=6) + def end(self, *, arguments, x, deltas, improvement, last_improvement, base_value): """ - Termination condition: max number of iterations, or no improvement for last step, or - improvement less than acceptable ratio, or estimated value not positive. + Termination step preparing the return value. Args: - x: Current solution estimate $x_t$. - iteration: Current iteration counter $t$. - deltas: Current difference $x_t - x'$. - improvement: Current improvement $(f(x_t) - f(x')) / v'$. - last_improvement: Last improvement $(f(x_{t-1}) - f(x')) / v'$. - estimated_improvement: Current estimated value $v'$. + x: Final solution estimate $x_n$. + deltas: Current difference $x_n - x_{n-1}$. + improvement: Current improvement $(f(x') - f(x_t))$. + last_improvement: Last improvement $(f(x') - f(x_{t-1}))$. + base_value: Value $f(x')$ at $x = x'$. Returns: - True if another iteration should be performed. + Final solution. 
""" - next_step = super(LineSearch, self).tf_next_step( - x, iteration, deltas, improvement, last_improvement, estimated_improvement - ) - def undo_deltas(): - value = self.fn_x([-delta for delta in deltas]) - with tf.control_dependencies(control_inputs=(value,)): - # Trivial operation to enforce control dependency - return tf.less(x=value, y=value) # == False + def keep_last_step(): + return x.fmap(function=(lambda t, delta: t + delta), zip_values=deltas) - improved = tf.cond( - pred=(improvement > last_improvement), - true_fn=(lambda: True), - false_fn=undo_deltas - ) + def undo_last_step(): + target_value = self.fn_x(arguments, deltas.fmap(function=(lambda delta: -delta))) + + dependencies = [target_value] + if self.config.create_debug_assertions: + epsilon = tf_util.constant(value=1e-5, dtype='float') + epsilon = tf.math.maximum(x=epsilon, y=(epsilon * tf.math.abs(x=base_value))) + dependencies.append(tf.debugging.assert_less( + x=tf.math.abs(x=(base_value - target_value - last_improvement)), y=epsilon + )) + + with tf.control_dependencies(control_inputs=dependencies): + return x.fmap(function=tf_util.identity) - next_step = tf.logical_and(x=next_step, y=improved) - next_step = tf.logical_and(x=next_step, y=(improvement < self.accept_ratio)) - return tf.logical_and(x=next_step, y=(estimated_improvement > util.epsilon)) + accept_solution = (improvement >= last_improvement) + return tf.cond(pred=accept_solution, true_fn=keep_last_step, false_fn=undo_last_step) diff --git a/tensorforce/core/optimizers/solvers/solver.py b/tensorforce/core/optimizers/solvers/solver.py index 096012532..940170eaa 100644 --- a/tensorforce/core/optimizers/solvers/solver.py +++ b/tensorforce/core/optimizers/solvers/solver.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,44 +13,24 @@ # limitations under the License. # ============================================================================== -import tensorflow as tf -from tensorforce import util -import tensorforce.core.optimizers.solvers +from tensorforce.core import Module -class Solver(object): +class Solver(Module): """ - Generic TensorFlow-based solver which solves a not yet further specified - equation/optimization problem. + Generic TensorFlow-based solver which solves a not yet further specified equation/optimization + problem. """ - def __init__(self): - """ - Creates a new solver instance. - """ - # TensorFlow function - self.solve = tf.make_template(name_='solver', func_=self.tf_solve) - - def tf_solve(self, fn_x, *args): + def solve(self, *args, fn_x=None): """ Solves an equation/optimization for $x$ involving an expression $f(x)$. Args: - fn_x: A callable returning an expression $f(x)$ given $x$. *args: Additional solver-specific arguments. + fn_x: A callable returning an expression $f(x)$ given $x$. Returns: A solution $x$ to the problem as given by the solver. """ raise NotImplementedError - - @staticmethod - def from_config(config, kwargs=None): - """ - Creates a solver from a specification dict. 
- """ - return util.get_object( - obj=config, - predefined=tensorforce.core.optimizers.solvers.solvers, - kwargs=kwargs - ) diff --git a/tensorforce/core/optimizers/subsampling_step.py b/tensorforce/core/optimizers/subsampling_step.py index 6313018cf..9992e085c 100755 --- a/tensorforce/core/optimizers/subsampling_step.py +++ b/tensorforce/core/optimizers/subsampling_step.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,92 +13,82 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import tensorflow as tf -from tensorforce import util, TensorForceError -from tensorforce.core.optimizers import MetaOptimizer +from tensorforce.core import parameter_modules, tf_function, tf_util +from tensorforce.core.optimizers import UpdateModifier +from tensorforce.core.utils import TensorDict -class SubsamplingStep(MetaOptimizer): +class SubsamplingStep(UpdateModifier): """ - The subsampling-step meta optimizer randomly samples a subset of batch instances to calculate - the optimization step of another optimizer. + Subsampling-step update modifier, which randomly samples a subset of batch instances before + applying the given optimizer (specification key: `subsampling_step`). + + Args: + optimizer (specification): Optimizer configuration + (required). + fraction (parameter, int > 0 | 0.0 < float <= 1.0): Absolute/relative fraction of batch + timesteps to subsample (required). + name (string): (internal use). + arguments_spec (specification): internal use. """ - def __init__(self, optimizer, fraction=0.1, scope='subsampling-step', summary_labels=()): - """ - Creates a new subsampling-step meta optimizer instance. + def __init__(self, *, optimizer, fraction, name=None, arguments_spec=None): + super().__init__(optimizer=optimizer, name=name, arguments_spec=arguments_spec) - Args: - optimizer: The optimizer which is modified by this meta optimizer. - fraction: The fraction of instances of the batch to subsample. - """ - assert isinstance(fraction, float) and fraction > 0.0 - self.fraction = fraction + if isinstance(fraction, int): + self.is_fraction_absolute = True + self.fraction = self.submodule( + name='fraction', module=fraction, modules=parameter_modules, dtype='int', + min_value=1 + ) + else: + self.is_fraction_absolute = False + self.fraction = self.submodule( + name='fraction', module=fraction, modules=parameter_modules, dtype='float', + min_value=0.0, max_value=1.0 + ) - super(SubsamplingStep, self).__init__(optimizer=optimizer, scope=scope, summary_labels=summary_labels) + @tf_function(num_args=1) + def step(self, *, arguments, **kwargs): + if not self.is_fraction_absolute and self.fraction.is_constant(value=1.0): + return self.optimizer.step(arguments=arguments, **kwargs) - def tf_step( - self, - time, - variables, - arguments, - **kwargs - ): - """ - Creates the TensorFlow operations for performing an optimization step. 
+ batch_size = tf_util.cast(x=tf.shape(input=arguments['reward'])[0], dtype='int') + if self.is_fraction_absolute: + fraction = self.fraction.is_constant() + if fraction is None: + fraction = self.fraction.value() + else: + fraction = self.fraction.value() * tf_util.cast(x=batch_size, dtype='float') + fraction = tf_util.cast(x=fraction, dtype='int') + one = tf_util.constant(value=1, dtype='int') + fraction = tf.math.maximum(x=fraction, y=one) - Args: - time: Time tensor. - variables: List of variables to optimize. - arguments: Dict of arguments for callables, like fn_loss. - **kwargs: Additional arguments passed on to the internal optimizer. + def subsampled_step(): + subsampled_arguments = TensorDict() + indices = tf.random.uniform( + shape=(fraction,), maxval=batch_size, dtype=tf_util.get_dtype(type='int') + ) - Returns: - List of delta tensors corresponding to the updates for each optimized variable. - """ - # Get some (batched) argument to determine batch size. - arguments_iter = iter(arguments.values()) - some_argument = next(arguments_iter) + if 'states' in arguments and 'horizons' in arguments: + horizons = tf.gather(params=arguments['horizons'], indices=indices) + starts = horizons[:, 0] + lengths = horizons[:, 1] + states_indices = tf.ragged.range(starts=starts, limits=(starts + lengths)).values + function = (lambda x: tf.gather(params=x, indices=states_indices)) + subsampled_arguments['states'] = arguments['states'].fmap(function=function) + starts = tf.math.cumsum(x=lengths, exclusive=True) + subsampled_arguments['horizons'] = tf.stack(values=(starts, lengths), axis=1) - try: - while not isinstance(some_argument, tf.Tensor) or util.rank(some_argument) == 0: - if isinstance(some_argument, dict): - if some_argument: - arguments_iter = iter(some_argument.values()) - some_argument = next(arguments_iter) - elif isinstance(some_argument, list): - if some_argument: - arguments_iter = iter(some_argument) - some_argument = next(arguments_iter) - elif some_argument is None or util.rank(some_argument) == 0: - # Non-batched argument - some_argument = next(arguments_iter) - else: - raise TensorForceError("Invalid argument type.") - except StopIteration: - raise TensorForceError("Invalid argument type.") + for name, argument in arguments.items(): + if name not in subsampled_arguments: + subsampled_arguments[name] = tf.gather(params=argument, indices=indices) - batch_size = tf.shape(input=some_argument)[0] - num_samples = tf.cast( - x=(self.fraction * tf.cast(x=batch_size, dtype=util.tf_dtype('float'))), - dtype=util.tf_dtype('int') - ) - num_samples = tf.maximum(x=num_samples, y=1) - indices = tf.random_uniform(shape=(num_samples,), maxval=batch_size, dtype=tf.int32) + return self.optimizer.step(arguments=subsampled_arguments, **kwargs) - subsampled_arguments = util.map_tensors( - fn=(lambda arg: arg if util.rank(arg) == 0 else tf.gather(params=arg, indices=indices)), - tensors=arguments - ) + def normal_step(): + return self.optimizer.step(arguments=arguments, **kwargs) - return self.optimizer.step( - time=time, - variables=variables, - arguments=subsampled_arguments, - **kwargs - ) + return tf.cond(pred=(fraction < batch_size), true_fn=subsampled_step, false_fn=normal_step) diff --git a/tensorforce/core/optimizers/synchronization.py b/tensorforce/core/optimizers/synchronization.py index e66214fff..a367d6963 100755 --- a/tensorforce/core/optimizers/synchronization.py +++ b/tensorforce/core/optimizers/synchronization.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. 
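`SubsamplingStep` above gathers a random subset of batch indices, an absolute count if `fraction` is an int and a relative share of the batch otherwise, before delegating to the inner optimizer. A small NumPy illustration of that index selection (hypothetical batch, not library code):

```python
import numpy as np

def subsample_indices(batch_size, fraction, rng):
    """Pick random batch indices: int fraction = absolute count, float = relative share."""
    if isinstance(fraction, int):
        num_samples = fraction
    else:
        num_samples = max(1, int(fraction * batch_size))
    if num_samples >= batch_size:
        return np.arange(batch_size)  # the diff falls back to the un-subsampled step here
    # Sampling with replacement, mirroring tf.random.uniform over [0, batch_size).
    return rng.integers(low=0, high=batch_size, size=num_samples)

rng = np.random.default_rng(0)
rewards = np.arange(10.0)                       # stand-in for one batched argument
indices = subsample_indices(len(rewards), 0.3, rng)
print(rewards[indices])                         # subsampled batch for the inner optimizer
```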
+# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,81 +13,102 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import tensorflow as tf -from tensorforce import util +from tensorforce.core import parameter_modules, TensorSpec, tf_function, tf_util from tensorforce.core.optimizers import Optimizer class Synchronization(Optimizer): """ - The synchronization optimizer updates variables periodically to the value of a corresponding - set of source variables. + Synchronization optimizer, which updates variables periodically to the value of a corresponding + set of source variables (specification key: `synchronization`). + + Args: + optimizer (specification): Optimizer configuration + (required). + update_weight (parameter, 0.0 < float <= 1.0): Update weight + (required). + sync_frequency (parameter, int >= 1): Interval between updates which also perform a + synchronization step (default: every update). + name (string): (internal use). + arguments_spec (specification): internal use. """ - def __init__(self, sync_frequency=1, update_weight=1.0, scope='synchronization', summary_labels=()): - """ - Creates a new synchronization optimizer instance. - - Args: - sync_frequency: The interval between optimization calls actually performing a - synchronization step. - update_weight: The update weight, 1.0 meaning a full assignment of the source - variables values. - """ - assert isinstance(sync_frequency, int) and sync_frequency > 0 - self.sync_frequency = sync_frequency - - assert isinstance(update_weight, float) and update_weight > 0.0 - self.update_weight = update_weight - - super(Synchronization, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_step(self, time, variables, source_variables, **kwargs): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - source_variables: List of source variables to synchronize with. - **kwargs: Additional arguments, not used. - - Returns: - List of delta tensors corresponding to the updates for each optimized variable. 
- """ - assert all(util.shape(source) == util.shape(target) for source, target in zip(source_variables, variables)) - - last_sync = tf.get_variable( - name='last-sync', - dtype=tf.int32, - initializer=(-self.sync_frequency), - trainable=False + def __init__(self, *, update_weight, sync_frequency=None, name=None, arguments_spec=None): + super().__init__(name=name, arguments_spec=arguments_spec) + + self.update_weight = self.submodule( + name='update_weight', module=update_weight, modules=parameter_modules, dtype='float', + min_value=0.0, max_value=1.0 ) - def sync(): - deltas = list() - for source_variable, target_variable in zip(source_variables, variables): - delta = self.update_weight * (source_variable - target_variable) - deltas.append(delta) + if sync_frequency is None: + sync_frequency = 1 + self.sync_frequency = self.submodule( + name='sync_frequency', module=sync_frequency, modules=parameter_modules, + dtype='int', min_value=1 + ) + + def initialize(self): + super().initialize() + + if not self.sync_frequency.is_constant(value=1): + self.next_sync = self.variable( + name='next-sync', spec=TensorSpec(type='int'), initializer='zeros', + is_trainable=False, is_saved=True + ) - applied = self.apply_step(variables=variables, deltas=deltas) - last_sync_updated = last_sync.assign(value=time) + @tf_function(num_args=1) + def step(self, *, arguments, variables, **kwargs): + assert 'source_variables' in kwargs + source_variables = kwargs['source_variables'] - with tf.control_dependencies(control_inputs=(applied, last_sync_updated)): + assert all( + tf_util.shape(x=source) == tf_util.shape(x=target) + for source, target in zip(source_variables, variables) + ) + + one = tf_util.constant(value=1, dtype='int') + + def apply_sync(): + dependencies = list() + if not self.sync_frequency.is_constant(value=1): + dependencies.append(self.next_sync.assign( + value=self.sync_frequency.value(), read_value=False + )) + + with tf.control_dependencies(control_inputs=dependencies): + deltas = list() + assignments = list() + if self.update_weight.is_constant(value=1.0): + for source_var, target_var in zip(source_variables, variables): + deltas.append(source_var - target_var) + assignments.append(target_var.assign(value=source_var, read_value=False)) + else: + update_weight = self.update_weight.value() + for source_var, target_var in zip(source_variables, variables): + delta = update_weight * (source_var - target_var) + deltas.append(delta) + assignments.append(target_var.assign_add(delta=delta, read_value=False)) + + with tf.control_dependencies(control_inputs=assignments): # Trivial operation to enforce control dependency - return [delta + 0.0 for delta in deltas] + return [tf_util.identity(input=delta) for delta in deltas] def no_sync(): - deltas = list() - for variable in variables: - delta = tf.zeros(shape=util.shape(variable)) - deltas.append(delta) - return deltas - - do_sync = (time - last_sync >= self.sync_frequency) - return tf.cond(pred=do_sync, true_fn=sync, false_fn=no_sync) + next_sync_updated = self.next_sync.assign_sub(delta=one, read_value=False) + + with tf.control_dependencies(control_inputs=(next_sync_updated,)): + deltas = list() + for variable in variables: + delta = tf_util.zeros(shape=tf_util.shape(x=variable), dtype='float') + deltas.append(delta) + return deltas + + if self.sync_frequency.is_constant(value=1): + return apply_sync() + + else: + skip_sync = tf.math.greater(x=self.next_sync, y=one) + return tf.cond(pred=skip_sync, true_fn=no_sync, false_fn=apply_sync) diff --git 
a/tensorforce/core/optimizers/tf_optimizer.py b/tensorforce/core/optimizers/tf_optimizer.py index 4b2e27b74..a05681499 100755 --- a/tensorforce/core/optimizers/tf_optimizer.py +++ b/tensorforce/core/optimizers/tf_optimizer.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,108 +13,183 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division +from functools import partial import tensorflow as tf +from tensorforce.core import parameter_modules, tf_function, tf_util from tensorforce.core.optimizers import Optimizer +tensorflow_optimizers = dict( + adadelta=tf.keras.optimizers.legacy.Adadelta, + adagrad=tf.keras.optimizers.legacy.Adagrad, + adam=tf.keras.optimizers.legacy.Adam, + adamax=tf.keras.optimizers.legacy.Adamax, + ftrl=tf.keras.optimizers.legacy.Ftrl, + nadam=tf.keras.optimizers.legacy.Nadam, + rmsprop=tf.keras.optimizers.legacy.RMSprop, + sgd=tf.keras.optimizers.legacy.SGD +) + + +try: + import tensorflow_addons as tfa + + tensorflow_optimizers['adamw'] = tfa.optimizers.AdamW + tensorflow_optimizers['lazyadam'] = tfa.optimizers.LazyAdam + tensorflow_optimizers['radam'] = tfa.optimizers.RectifiedAdam + tensorflow_optimizers['ranger'] = (lambda **kwargs: tfa.optimizers.Lookahead( + optimizer=tfa.optimizers.RectifiedAdam(**kwargs), name=kwargs['name'] + )) + tensorflow_optimizers['sgdw'] = tfa.optimizers.SGDW +except ModuleNotFoundError: + pass + + class TFOptimizer(Optimizer): """ - Wrapper class for TensorFlow optimizers. + TensorFlow optimizer (specification key: `tf_optimizer`, `adadelta`, `adagrad`, `adam`, + `adamax`, `adamw`, `ftrl`, `lazyadam`, `nadam`, `radam`, `ranger`, `rmsprop`, `sgd`, `sgdw`) + + Args: + optimizer (`adadelta` | `adagrad` | `adam` | `adamax` | `adamw` | `ftrl` | `lazyadam` | `nadam` | `radam` | `ranger` | `rmsprop` | `sgd` | `sgdw`): + TensorFlow optimizer name, see + `TensorFlow docs `__ + and `TensorFlow Addons docs + `__ + (required unless given by specification key). + learning_rate (parameter, float > 0.0): Learning rate + (required). + gradient_norm_clipping (parameter, float > 0.0): Clip gradients by the ratio of the sum + of their norms (default: 1.0). + name (string): (internal use). + arguments_spec (specification): internal use. + kwargs: Arguments for the TensorFlow optimizer, special values "decoupled_weight_decay", + "lookahead" and "moving_average", see + `TensorFlow docs `__ + and `TensorFlow Addons docs + `__. """ - tf_optimizers = dict( - adadelta=tf.train.AdadeltaOptimizer, - adagrad=tf.train.AdagradOptimizer, - adam=tf.train.AdamOptimizer, - nadam=tf.contrib.opt.NadamOptimizer, - gradient_descent=tf.train.GradientDescentOptimizer, - momentum=tf.train.MomentumOptimizer, - rmsprop=tf.train.RMSPropOptimizer - ) - - @staticmethod - def get_wrapper(optimizer): - """ - Returns a TFOptimizer constructor callable for the given optimizer name. - - Args: - optimizer: The name of the optimizer, one of 'adadelta', 'adagrad', 'adam', 'nadam', - 'gradient_descent', 'momentum', 'rmsprop'. - - Returns: - The TFOptimizer constructor callable. 
- """ - def wrapper(**kwargs): - return TFOptimizer(optimizer=optimizer, **kwargs) - return wrapper - - def __init__(self, optimizer, scope=None, summary_labels=(), **kwargs): - """ - Creates a new optimizer instance of a TensorFlow optimizer. - - Args: - optimizer: The name of the optimizer, one of 'adadelta', 'adagrad', 'adam', 'nadam', - 'gradient_descent', 'momentum', 'rmsprop'. - **kwargs: Additional arguments passed on to the TensorFlow optimizer constructor. - """ - self.optimizer_spec = optimizer - self.optimizer = TFOptimizer.tf_optimizers[optimizer](**kwargs) - - super(TFOptimizer, self).__init__(scope=(scope or optimizer), summary_labels=summary_labels) - - def tf_step( - self, - time, - variables, - arguments, - fn_loss, - **kwargs + def __init__( + self, *, optimizer, learning_rate, gradient_norm_clipping=None, name=None, + arguments_spec=None, **kwargs ): - """ - Creates the TensorFlow operations for performing an optimization step. - - Args: - time: Time tensor. - variables: List of variables to optimize. - arguments: Dict of arguments for callables, like fn_loss. - fn_loss: A callable returning the loss of the current model. - **kwargs: Additional arguments, not used. - - Returns: - List of delta tensors corresponding to the updates for each optimized variable. - """ - loss = fn_loss(**arguments) - - with tf.control_dependencies(control_inputs=(loss,)): - # Trivial operation to enforce control dependency - previous_variables = [variable + 0.0 for variable in variables] - - with tf.control_dependencies(control_inputs=previous_variables): - applied = self.optimizer.minimize(loss=loss, var_list=variables) # colocate_gradients_with_ops=True - - with tf.control_dependencies(control_inputs=(applied,)): - return [ - variable - previous_variable - for variable, previous_variable in zip(variables, previous_variables) - ] - - def get_variables(self): - optimizer_variables = super(TFOptimizer, self).get_variables() - - slots_variables = [ - self.optimizer._slots[slot][key] - for slot in sorted(self.optimizer._slots) - for key in sorted(self.optimizer._slots[slot]) - ] - - if self.optimizer_spec in ('adam', 'nadam'): - additional_variables = [self.optimizer._beta1_power, self.optimizer._beta2_power] - else: - additional_variables = list() + super().__init__(name=name, arguments_spec=arguments_spec) + + assert optimizer in tensorflow_optimizers + self.tf_optimizer = tensorflow_optimizers[optimizer] - return optimizer_variables + slots_variables + additional_variables + self.learning_rate = self.submodule( + name='learning_rate', module=learning_rate, modules=parameter_modules, dtype='float', + min_value=0.0 + ) + + if gradient_norm_clipping is None: + self.gradient_norm_clipping = None + else: + self.gradient_norm_clipping = self.submodule( + name='gradient_norm_clipping', module=gradient_norm_clipping, + modules=parameter_modules, dtype='float', min_value=0.0 + ) + + self.optimizer_kwargs = kwargs + + def compose(function1, function2): + def composed(*args, **kwargs): + return function1(function2(*args, **kwargs)) + return composed + + if 'decoupled_weight_decay' in self.optimizer_kwargs: + decoupled_weight_decay = self.optimizer_kwargs.pop('decoupled_weight_decay') + self.tf_optimizer = partial( + tfa.optimizers.extend_with_decoupled_weight_decay(base_optimizer=self.tf_optimizer), + weight_decay=decoupled_weight_decay + ) + if 'lookahead' in self.optimizer_kwargs: + lookahead = self.optimizer_kwargs.pop('lookahead') + if isinstance(lookahead, dict) or lookahead is True: + if lookahead 
is True: + lookahead = dict() + self.tf_optimizer = compose( + function1=partial(tfa.optimizers.Lookahead, name=self.name, **lookahead), + function2=self.tf_optimizer + ) + if 'moving_average' in self.optimizer_kwargs: + moving_avg = self.optimizer_kwargs.pop('moving_average') + if isinstance(moving_avg, dict) or moving_avg is True: + if moving_avg is True: + moving_avg = dict() + self.tf_optimizer = compose( + function1=partial(tfa.optimizers.MovingAverage, name=self.name, **moving_avg), + function2=self.tf_optimizer + ) + + def initialize(self): + super().initialize() + + self.tf_optimizer = self.tf_optimizer( + learning_rate=self.learning_rate.value, name='tf_optimizer', **self.optimizer_kwargs + ) + + self.register_summary(label='update-norm', name='unclipped-gradient-norm') + + def initialize_given_variables(self, *, variables): + super().initialize_given_variables(variables=variables) + + try: + self.tf_optimizer._create_all_weights(var_list=variables) + except AttributeError: + self.tf_optimizer._create_hypers() + self.tf_optimizer._create_slots(var_list=variables) + + @tf_function(num_args=1) + def step(self, *, arguments, variables, fn_loss, **kwargs): + # Trivial operation to enforce control dependency + previous_values = list(tf_util.identity(input=variable) for variable in variables) + + # Remember variables before update + with tf.control_dependencies(control_inputs=previous_values): + + with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape: + for variable in variables: + tape.watch(tensor=variable) + loss = fn_loss(**arguments.to_kwargs()) + + gradients = tape.gradient(target=loss, sources=variables) # , output_gradients=initial + + assertions = list() + gradients = list(gradients) + grads_and_vars = list(zip(gradients, variables)) + for n in range(len(gradients) - 1, -1, -1): + if gradients[n] is None: + gradients.pop(n) + grads_and_vars.pop(n) + elif self.config.create_tf_assertions: + assertions.append(tf.debugging.assert_all_finite( + x=gradients[n], message="Invalid gradient: contains inf or nan." + )) + assert len(gradients) > 0 + + with tf.control_dependencies(control_inputs=assertions): + + dependencies = list() + if self.gradient_norm_clipping is not None: + clip_norm = self.gradient_norm_clipping.value() + gradients, grads_norm = tf.clip_by_global_norm( + t_list=[tf_util.cast(x=g, dtype='float') for g in gradients], + clip_norm=clip_norm + ) + dependencies.extend(self.summary( + label='update-norm', name='unclipped-gradient-norm', data=grads_norm, + step='updates' + )) + grads_and_vars = [(grad, var) for grad, (_, var) in zip(gradients, grads_and_vars)] + + applied = self.tf_optimizer.apply_gradients(grads_and_vars=grads_and_vars) + dependencies.append(applied) + + # Return deltas after actually having change the variables. + with tf.control_dependencies(control_inputs=dependencies): + return [variable - previous for variable, previous in zip(variables, previous_values)] diff --git a/tensorforce/core/optimizers/update_modifier.py b/tensorforce/core/optimizers/update_modifier.py new file mode 100644 index 000000000..074ae7c57 --- /dev/null +++ b/tensorforce/core/optimizers/update_modifier.py @@ -0,0 +1,38 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorforce.core +from tensorforce.core.optimizers import Optimizer + + +class UpdateModifier(Optimizer): + """ + Update modifier, which takes the update mechanism implemented by another optimizer and modifies + it. + + Args: + optimizer (specification): Optimizer configuration + (required). + name (string): (internal use). + arguments_spec (specification): internal use. + """ + + def __init__(self, *, optimizer, name=None, arguments_spec=None): + super().__init__(name=name, arguments_spec=arguments_spec) + + self.optimizer = self.submodule( + name=name, module=optimizer, modules=tensorforce.core.optimizer_modules, + arguments_spec=self.arguments_spec + ) diff --git a/tensorforce/core/parameters/__init__.py b/tensorforce/core/parameters/__init__.py new file mode 100644 index 000000000..fe9ed7f2e --- /dev/null +++ b/tensorforce/core/parameters/__init__.py @@ -0,0 +1,46 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from functools import partial + +from tensorforce.core.parameters.parameter import Parameter + +from tensorforce.core.parameters.constant import Constant +from tensorforce.core.parameters.decaying import Decaying +from tensorforce.core.parameters.exponential import Exponential +from tensorforce.core.parameters.linear import Linear +from tensorforce.core.parameters.ornstein_uhlenbeck import OrnsteinUhlenbeck +from tensorforce.core.parameters.piecewise_constant import PiecewiseConstant +from tensorforce.core.parameters.random import Random + + +parameter_modules = dict( + constant=Constant, decaying=Decaying, default=Constant, exponential=Exponential, linear=Linear, + ornstein_uhlenbeck=OrnsteinUhlenbeck, piecewise_constant=PiecewiseConstant, random=Random +) + + +for name in ( + 'polynomial', 'inverse_time', 'cosine', 'cosine_restarts', 'linear_cosine', + 'linear_cosine_noisy' +): + assert name not in parameter_modules + parameter_modules[name] = partial(Decaying, decay=name) + + +__all__ = [ + 'Constant', 'Decaying', 'Exponential', 'Linear', 'OrnsteinUhlenbeck', 'Parameter', + 'parameter_modules', 'PiecewiseConstant', 'Random' +] diff --git a/tensorforce/core/parameters/constant.py b/tensorforce/core/parameters/constant.py new file mode 100644 index 000000000..ef1a8bcf7 --- /dev/null +++ b/tensorforce/core/parameters/constant.py @@ -0,0 +1,76 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce import TensorforceError +from tensorforce.core import tf_function, tf_util +from tensorforce.core.parameters import Parameter + + +class Constant(Parameter): + """ + Constant hyperparameter (specification key: `constant`). + + Args: + value (float | int | bool): Constant hyperparameter value + (required). + name (string): internal use. + dtype (type): internal use. + min_value (dtype-compatible value): internal use. + max_value (dtype-compatible value): internal use. + """ + + # Argument 'value' first for default specification + def __init__(self, value, *, name=None, dtype=None, min_value=None, max_value=None): + if isinstance(value, bool): + if dtype != 'bool': + raise TensorforceError.dtype(name='Constant', argument='value', dtype=type(value)) + elif isinstance(value, int): + if dtype != 'int': + raise TensorforceError.dtype(name='Constant', argument='value', dtype=type(value)) + elif isinstance(value, float): + if dtype != 'float': + raise TensorforceError.dtype(name='Constant', argument='value', dtype=type(value)) + else: + raise TensorforceError.unexpected() + if min_value is not None and value < min_value: + raise TensorforceError.value( + name='Constant', argument='value', value=value, + hint='< {} lower bound'.format(min_value) + ) + if max_value is not None and value > max_value: + raise TensorforceError.value( + name='Constant', argument='value', value=value, + hint='> {} upper bound'.format(max_value) + ) + + self.constant_value = value + + super().__init__(name=name, dtype=dtype, min_value=min_value, max_value=max_value) + + def min_value(self): + return self.constant_value + + def max_value(self): + return self.constant_value + + def final_value(self): + return self.constant_value + + def initialize(self): + super(Parameter, self).initialize() + + @tf_function(num_args=0) + def value(self): + return tf_util.constant(value=self.constant_value, dtype=self.spec.type) diff --git a/tensorforce/core/parameters/decaying.py b/tensorforce/core/parameters/decaying.py new file mode 100644 index 000000000..215f419c2 --- /dev/null +++ b/tensorforce/core/parameters/decaying.py @@ -0,0 +1,411 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
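For orientation, the parameter classes in this package are usually referenced via specification dicts resolved through `parameter_modules`. A hedged sketch with hypothetical values, assuming hyperparameter arguments such as exploration or learning rate accept such specs:

```python
# Illustrative only: hyperparameter specifications resolved via parameter_modules.
constant_exploration = 0.1  # plain value, handled by the default 'constant' module

linear_exploration = dict(  # 'linear' resolves to the Linear subclass of Decaying
    type='linear', unit='timesteps', num_steps=10000,
    initial_value=0.5, final_value=0.01
)

exponential_learning_rate = dict(  # 'exponential' resolves to the Exponential subclass
    type='exponential', unit='updates', num_steps=1000,
    initial_value=1e-3, decay_rate=0.5
)
```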
+# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError, util +from tensorforce.core import tf_util +from tensorforce.core.parameters import Parameter + + +class Decaying(Parameter): + """ + Decaying hyperparameter (specification key: `decaying`, `linear`, `exponential`, `polynomial`, + `inverse_time`, `cosine`, `cosine_restarts`, `linear_cosine`, `linear_cosine_noisy`). + + Args: + decay ("linear" | "exponential" | "polynomial" | "inverse_time" | "cosine" | "cosine_restarts" | "linear_cosine" | "linear_cosine_noisy"): + Decay type, see also + `TensorFlow docs `__ + (required). + unit ("timesteps" | "episodes" | "updates"): Unit of decay schedule + (required). + num_steps (int): Number of decay steps + (required). + initial_value (float | int): Initial value + (required). + increasing (bool): Whether to subtract the decayed value from 1.0 + (default: false). + inverse (bool): Whether to take the inverse of the decayed value + (default: false). + scale (float): Scaling factor for (inverse) decayed value + (default: 1.0). + kwargs: Additional arguments depend on decay mechanism.
+                Linear decay:
+                    final_value (float | int): Final value (required).
+                Exponential decay:
+                    decay_rate (float): Decay rate (required).
+                    staircase (bool): Whether to apply decay in a discrete staircase, as opposed
+                        to continuous, fashion (default: false).
+                Polynomial decay:
+                    final_value (float | int): Final value (required).
+                    power (float | int): Power of polynomial (default: 1, thus linear).
+                    cycle (bool): Whether to cycle beyond num_steps (default: false).
+                Inverse time decay:
+                    decay_rate (float): Decay rate (required).
+                    staircase (bool): Whether to apply decay in a discrete staircase, as opposed
+                        to continuous, fashion (default: false).
+                Cosine decay:
+                    alpha (float): Minimum learning rate value as a fraction of learning_rate
+                        (default: 0.0).
+                Cosine decay with restarts:
+                    t_mul (float): Used to derive the number of iterations in the i-th period
+                        (default: 2.0).
+                    m_mul (float): Used to derive the initial learning rate of the i-th period
+                        (default: 1.0).
+                    alpha (float): Minimum learning rate value as a fraction of the learning_rate
+                        (default: 0.0).
+                Linear cosine decay:
+                    num_periods (float): Number of periods in the cosine part of the decay
+                        (default: 0.5).
+                    alpha (float): Alpha value (default: 0.0).
+                    beta (float): Beta value (default: 0.001).
+                Noisy linear cosine decay:
+                    initial_variance (float): Initial variance for the noise (default: 1.0).
+                    variance_decay (float): Decay for the noise's variance (default: 0.55).
+                    num_periods (float): Number of periods in the cosine part of the decay
+                        (default: 0.5).
+                    alpha (float): Alpha value (default: 0.0).
+                    beta (float): Beta value (default: 0.001).
                + name (string): internal use. + dtype (type): internal use. + min_value (dtype-compatible value): internal use. + max_value (dtype-compatible value): internal use. + """ + + def __init__( + self, *, decay, unit, num_steps, initial_value, increasing=False, inverse=False, scale=1.0, + name=None, dtype=None, min_value=None, max_value=None, **kwargs + ): + assert decay in ( + 'cosine', 'cosine_restarts', 'exponential', 'inverse_time', 'linear', 'linear_cosine', + 'linear_cosine_noisy', 'polynomial' + ) + assert unit in ('timesteps', 'episodes', 'updates') + assert (isinstance(num_steps, int) or num_steps % 10.0 == 0.0) and num_steps > 0 + assert isinstance(initial_value, (float, int)) + + if isinstance(initial_value, int): + if dtype != 'int': + raise TensorforceError.dtype( + name='Decaying', argument='initial_value', dtype=type(initial_value) + ) + elif isinstance(initial_value, float): + if dtype != 'float': + raise TensorforceError.dtype( + name='Decaying', argument='initial_value', dtype=type(initial_value) + ) + else: + raise TensorforceError.unexpected() + + if decay == 'linear': + assert len(kwargs) == 1 and 'final_value' in kwargs + decay = 'polynomial' + + self.decay = decay + self.num_steps = int(num_steps) + self.initial_value = initial_value + self.increasing = increasing + self.inverse = inverse + self.scale = scale + self.kwargs = kwargs + + super().__init__( + unit=unit, name=name, dtype=dtype, min_value=min_value, max_value=max_value + ) + + def min_value(self): + if self.decay == 'cosine' or self.decay == 'cosine_restarts': + assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0 + if self.initial_value >= 0.0: + min_value = self.initial_value * self.kwargs.get('alpha', 0.0) + max_value = self.initial_value + else: + min_value = self.initial_value + max_value = self.initial_value * self.kwargs.get('alpha', 0.0) + + elif self.decay == 'exponential' or self.decay == 'inverse_time': + assert 0.0 <= self.kwargs['decay_rate'] <= 1.0 + if self.kwargs['decay_rate'] == 1.0: + min_value = max_value = self.initial_value + elif self.initial_value >= 0.0: + min_value = 0.0 + max_value = self.initial_value + else: + min_value = self.initial_value + max_value = 0.0 + + elif self.decay == 'linear_cosine' or self.decay == 'linear_cosine_noisy': + assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0 + assert 0.0 <= self.kwargs.get('beta', 0.0) <= 1.0 + if self.initial_value >= 0.0: + min_value = self.initial_value * self.kwargs.get('beta', 0.001) + max_value = self.initial_value * ( + 1.0 + self.kwargs.get('alpha', 0.0) + self.kwargs.get('beta', 0.001) + ) + else: + min_value = self.initial_value * ( + 1.0 + self.kwargs.get('alpha', 0.0) + self.kwargs.get('beta', 0.001) + ) + max_value = self.initial_value * self.kwargs.get('beta', 0.001) + + elif self.decay == 'polynomial': + if self.kwargs.get('power', 1.0) == 0.0: + min_value = max_value = self.initial_value + elif self.initial_value >= self.kwargs['final_value']: + min_value = self.kwargs['final_value'] + max_value = self.initial_value + else: + min_value = self.initial_value + max_value = self.kwargs['final_value'] + + assert min_value <= max_value + + if self.increasing: + assert 0.0 <= min_value <= max_value <= 1.0 + min_value, max_value = 1.0 - max_value, 1.0 - min_value + + if self.inverse: + assert util.epsilon <= min_value <= max_value + min_value, max_value = 1.0 / max_value, 1.0 / min_value + + if self.scale == 1.0: + pass + elif self.scale >= 0.0: + min_value, max_value = self.scale * min_value, self.scale * max_value 
+ else: + min_value, max_value = self.scale * max_value, self.scale * min_value + + return self.spec.py_type()(min_value) + + def max_value(self): + if self.decay == 'cosine' or self.decay == 'cosine_restarts': + assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0 + if self.initial_value >= 0.0: + min_value = self.initial_value * self.kwargs.get('alpha', 0.0) + max_value = self.initial_value + else: + min_value = self.initial_value + max_value = self.initial_value * self.kwargs.get('alpha', 0.0) + + elif self.decay == 'exponential' or self.decay == 'inverse_time': + assert 0.0 <= self.kwargs['decay_rate'] <= 1.0 + if self.kwargs['decay_rate'] == 1.0: + min_value = max_value = self.initial_value + elif self.initial_value >= 0.0: + min_value = 0.0 + max_value = self.initial_value + else: + min_value = self.initial_value + max_value = 0.0 + + elif self.decay == 'linear_cosine' or self.decay == 'linear_cosine_noisy': + assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0 + assert 0.0 <= self.kwargs.get('beta', 0.0) <= 1.0 + if self.initial_value >= 0.0: + min_value = self.initial_value * self.kwargs.get('beta', 0.001) + max_value = self.initial_value * ( + 1.0 + self.kwargs.get('alpha', 0.0) + self.kwargs.get('beta', 0.001) + ) + else: + min_value = self.initial_value * ( + 1.0 + self.kwargs.get('alpha', 0.0) + self.kwargs.get('beta', 0.001) + ) + max_value = self.initial_value * self.kwargs.get('beta', 0.001) + + elif self.decay == 'polynomial': + if self.kwargs.get('power', 1.0) == 0.0: + min_value = max_value = self.initial_value + elif self.initial_value >= self.kwargs['final_value']: + min_value = self.kwargs['final_value'] + max_value = self.initial_value + else: + min_value = self.initial_value + max_value = self.kwargs['final_value'] + + assert min_value <= max_value + + if self.increasing: + assert 0.0 <= min_value <= max_value <= 1.0 + min_value, max_value = 1.0 - max_value, 1.0 - min_value + + if self.inverse: + assert 0.0 < min_value <= max_value + min_value, max_value = 1.0 / max_value, 1.0 / min_value + + if self.scale == 1.0: + pass + elif self.scale >= 0.0: + min_value, max_value = self.scale * min_value, self.scale * max_value + else: + min_value, max_value = self.scale * max_value, self.scale * min_value + + return self.spec.py_type()(max_value) + + def final_value(self): + if self.decay == 'cosine' or self.decay == 'cosine_restarts': + assert 0.0 <= self.kwargs['decay_rate'] <= 1.0 + value = self.initial_value * self.kwargs.get('alpha', 0.0) + + elif self.decay == 'exponential' or self.decay == 'inverse_time': + assert 0.0 <= self.kwargs['decay_rate'] <= 1.0 + if self.kwargs['decay_rate'] == 1.0: + value = self.initial_value + else: + value = 0.0 + + elif self.decay == 'linear_cosine' or self.decay == 'linear_cosine_noisy': + assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0 + assert 0.0 <= self.kwargs.get('beta', 0.0) <= 1.0 + value = self.initial_value * self.kwargs.get('beta', 0.001) + + elif self.decay == 'polynomial': + if self.kwargs.get('power', 1.0) == 0.0: + value = self.initial_value + else: + value = self.kwargs['final_value'] + + if self.increasing: + assert 0.0 <= value <= 1.0 + value = 1.0 - value + + if self.inverse: + assert value > 0.0 + value = 1.0 / value + + if self.scale != 1.0: + value = value * self.scale + + return self.spec.py_type()(value) + + def parameter_value(self, *, step): + initial_value = tf_util.constant(value=self.initial_value, dtype='float') + + if self.decay == 'cosine': + assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0 + parameter = 
tf.keras.experimental.CosineDecay( + initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1), + alpha=self.kwargs.get('alpha', 0.0) + )(step=step) + + elif self.decay == 'cosine_restarts': + assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0 + parameter = tf.keras.experimental.CosineDecayRestarts( + initial_learning_rate=initial_value, first_decay_steps=(self.num_steps + 1), + t_mul=self.kwargs.get('t_mul', 2.0), m_mul=self.kwargs.get('m_mul', 1.0), + alpha=self.kwargs.get('alpha', 0.0) + )(step=step) + + elif self.decay == 'exponential': + assert self.kwargs['decay_rate'] >= 0.0 + parameter = tf.keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1), + decay_rate=self.kwargs['decay_rate'], staircase=self.kwargs.get('staircase', False) + )(step=step) + + elif self.decay == 'inverse_time': + assert self.kwargs['decay_rate'] >= 0.0 + parameter = tf.keras.optimizers.schedules.InverseTimeDecay( + initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1), + decay_rate=self.kwargs['decay_rate'], staircase=self.kwargs.get('staircase', False) + )(step=step) + + elif self.decay == 'linear_cosine': + assert self.kwargs.get('beta', 0.001) >= 0.0 + parameter = tf.keras.experimental.LinearCosineDecay( + initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1), + num_periods=self.kwargs.get('num_periods', 0.5), + alpha=self.kwargs.get('alpha', 0.0), beta=self.kwargs.get('beta', 0.001) + )(step=step) + + elif self.decay == 'linear_cosine_noisy': + assert self.kwargs.get('beta', 0.001) >= 0.0 + parameter = tf.keras.experimental.NoisyLinearCosineDecay( + initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1), + initial_variance=self.kwargs.get('initial_variance', 1.0), + variance_decay=self.kwargs.get('variance_decay', 0.55), + num_periods=self.kwargs.get('num_periods', 0.5), + alpha=self.kwargs.get('alpha', 0.0), beta=self.kwargs.get('beta', 0.001) + )(step=step) + + elif self.decay == 'polynomial': + assert self.kwargs.get('power', 1.0) >= 0.0 + parameter = tf.keras.optimizers.schedules.PolynomialDecay( + initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1), + end_learning_rate=self.kwargs['final_value'], power=self.kwargs.get('power', 1.0), + cycle=self.kwargs.get('cycle', False) + )(step=step) + + if self.increasing: + one = tf_util.constant(value=1.0, dtype='float') + assertions = list() + if self.config.create_tf_assertions: + zero = tf_util.constant(value=0.0, dtype='float') + assertions.append(tf.debugging.assert_greater_equal(x=parameter, y=zero)) + assertions.append(tf.debugging.assert_less_equal(x=parameter, y=one)) + with tf.control_dependencies(control_inputs=assertions): + parameter = one - parameter + + if self.inverse: + zero = tf_util.constant(value=0.0, dtype='float') + epsilon = tf_util.constant(value=util.epsilon, dtype='float') + parameter = tf.where( + condition=(parameter > zero), x=(parameter + epsilon), y=(parameter - epsilon) + ) + parameter = tf.math.reciprocal(x=parameter) + + if self.scale != 1.0: + scale = tf_util.constant(value=self.scale, dtype='float') + parameter = parameter * scale + + parameter = tf_util.cast(x=parameter, dtype=self.spec.type) + + return parameter diff --git a/tensorforce/core/parameters/exponential.py b/tensorforce/core/parameters/exponential.py new file mode 100644 index 000000000..edd54ea5e --- /dev/null +++ b/tensorforce/core/parameters/exponential.py @@ -0,0 +1,52 @@ +# Copyright 2020 Tensorforce Team. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import tf_util +from tensorforce.core.parameters import Decaying + + +class Exponential(Decaying): + """ + Exponentially decaying hyperparameter (specification key: `exponential`). + + Args: + unit ("timesteps" | "episodes" | "updates"): Unit of decay schedule + (required). + num_steps (int): Number of decay steps + (required). + initial_value (float): Initial value + (required). + decay_rate (float): Decay rate + (required). + staircase (bool): Whether to apply decay in a discrete staircase, as opposed to continuous, + fashion (default: false). + name (string): internal use. + dtype (type): internal use. + min_value (dtype-compatible value): internal use. + max_value (dtype-compatible value): internal use. + """ + + def __init__( + self, *, unit, num_steps, initial_value, decay_rate, staircase=False, name=None, dtype=None, + min_value=None, max_value=None, **kwargs + ): + super().__init__( + decay='exponential', unit=unit, num_steps=num_steps, initial_value=initial_value, + name=name, dtype=dtype, min_value=min_value, max_value=max_value, decay_rate=decay_rate, + staircase=staircase, **kwargs + ) diff --git a/tensorforce/core/parameters/linear.py b/tensorforce/core/parameters/linear.py new file mode 100644 index 000000000..9589beed1 --- /dev/null +++ b/tensorforce/core/parameters/linear.py @@ -0,0 +1,45 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core.parameters import Decaying + + +class Linear(Decaying): + """ + Linear hyperparameter (specification key: `linear`). + + Args: + unit ("timesteps" | "episodes" | "updates"): Unit of decay schedule + (required). + num_steps (int): Number of decay steps + (required). + initial_value (float): Initial value + (required). + final_value (float): Final value + (required). + name (string): internal use. + dtype (type): internal use. + min_value (dtype-compatible value): internal use. + max_value (dtype-compatible value): internal use. 
+ """ + + def __init__( + self, *, unit, num_steps, initial_value, final_value, name=None, dtype=None, min_value=None, + max_value=None + ): + super().__init__( + decay='linear', unit=unit, num_steps=num_steps, initial_value=initial_value, name=name, + dtype=dtype, min_value=min_value, max_value=max_value, final_value=final_value + ) diff --git a/tensorforce/core/parameters/ornstein_uhlenbeck.py b/tensorforce/core/parameters/ornstein_uhlenbeck.py new file mode 100644 index 000000000..8c71ec3c5 --- /dev/null +++ b/tensorforce/core/parameters/ornstein_uhlenbeck.py @@ -0,0 +1,78 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import TensorSpec, tf_util +from tensorforce.core.parameters import Parameter + + +class OrnsteinUhlenbeck(Parameter): + """ + Ornstein-Uhlenbeck process (specification key: `ornstein_uhlenbeck`). + + Args: + theta (float > 0.0): Theta value + (default: 0.15). + sigma (float > 0.0): Sigma value + (default: 0.3). + mu (float): Mu value + (default: 0.0). + absolute (bool): Absolute value + (default: false). + name (string): internal use. + dtype (type): internal use. + min_value (dtype-compatible value): internal use. + max_value (dtype-compatible value): internal use. + """ + + def __init__( + self, *, theta=0.15, sigma=0.3, mu=0.0, absolute=False, name=None, dtype=None, + min_value=None, max_value=None + ): + self.theta = theta + self.mu = mu + self.sigma = sigma + self.absolute = absolute + + super().__init__(name=name, dtype=dtype, min_value=min_value, max_value=max_value) + + def min_value(self): + if self.absolute: + return self.spec.py_type()(0.0) + else: + super().min_value() + + def final_value(self): + return self.spec.py_type()(self.mu) + + def initialize(self): + super().initialize() + + self.process = self.variable( + name='process', spec=TensorSpec(type='float'), initializer=self.mu, is_trainable=False, + is_saved=True + ) + + def parameter_value(self, *, step): + delta = self.theta * (self.mu - self.process) + self.sigma * tf.random.normal(shape=()) + if self.absolute: + parameter = self.process.assign(value=tf.math.abs(x=(self.process + delta))) + else: + parameter = self.process.assign_add(delta=delta) + + parameter = tf_util.cast(x=parameter, dtype=self.spec.type) + + return parameter diff --git a/tensorforce/core/parameters/parameter.py b/tensorforce/core/parameters/parameter.py new file mode 100644 index 000000000..f350f3720 --- /dev/null +++ b/tensorforce/core/parameters/parameter.py @@ -0,0 +1,144 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import Module, SignatureDict, TensorSpec, tf_function, tf_util + + +class Parameter(Module): + """ + Base class for dynamic hyperparameters. + + Args: + unit ("timesteps" | "episodes" | "updates"): Unit of parameter schedule + (default: timesteps). + name (string): internal use. + dtype (type): internal use. + shape (iter[int > 0]): internal use. + min_value (dtype-compatible value): internal use. + max_value (dtype-compatible value): internal use. + """ + + def __init__( + self, *, unit='timesteps', name=None, dtype=None, shape=(), min_value=None, max_value=None + ): + super().__init__(name=name) + + assert unit in (None, 'timesteps', 'episodes', 'updates') + self.unit = unit + + self.spec = TensorSpec(type=dtype, shape=shape, min_value=min_value, max_value=max_value) + + assert self.min_value() is None or self.max_value() is None or \ + self.min_value() <= self.max_value() + if self.spec.min_value is not None: + if self.min_value() is None: + raise TensorforceError.value( + name=self.name, argument='lower bound', value=self.min_value(), + hint=('not >= {}'.format(self.spec.min_value)) + ) + elif self.min_value() < self.spec.min_value: + raise TensorforceError.value( + name=self.name, argument='lower bound', value=self.min_value(), + hint=('< {}'.format(self.spec.min_value)) + ) + if self.spec.max_value is not None: + if self.max_value() is None: + raise TensorforceError.value( + name=self.name, argument='upper bound', value=self.max_value(), + hint=('not <= {}'.format(self.spec.max_value)) + ) + elif self.max_value() > self.spec.max_value: + raise TensorforceError.value( + name=self.name, argument='upper bound', value=self.max_value(), + hint=('> {}'.format(self.spec.max_value)) + ) + + def min_value(self): + return None + + def max_value(self): + return None + + def is_constant(self, *, value=None): + if value is None: + if self.min_value() is not None and self.min_value() == self.max_value(): + assert self.final_value() == self.min_value() + assert isinstance(self.final_value(), self.spec.py_type()) + return self.final_value() + else: + return None + else: + assert isinstance(value, self.spec.py_type()) + if self.min_value() == value and self.max_value() == value: + assert self.final_value() == value + return True + else: + return False + + def final_value(self): + raise NotImplementedError + + def initialize(self): + super().initialize() + + self.register_summary(label='parameters', name=('parameters/' + self.name)) + + self.register_tracking(label='parameters', name=self.name, spec=self.spec) + + def input_signature(self, *, function): + if function == 'value': + return SignatureDict() + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'value': + return SignatureDict(singleton=self.spec.signature(batched=False)) + + else: + return super().output_signature(function=function) + + def parameter_value(self, *, step): + raise 
NotImplementedError + + @tf_function(num_args=0) + def value(self): + if self.unit is None: + step = None + else: + step = self.root.units[self.unit] + + parameter = self.parameter_value(step=step) + + dependencies = self.spec.tf_assert( + x=parameter, include_type_shape=True, + message='Parameter.value: invalid {{issue}} for {name} value.'.format(name=self.name) + ) + + name = 'parameters/' + self.name + if self.unit is None: + step = 'timesteps' + else: + step = self.unit + dependencies.extend(self.summary(label='parameters', name=name, data=parameter, step=step)) + + dependencies.extend(self.track(label='parameters', name=self.name, data=parameter)) + + with tf.control_dependencies(control_inputs=dependencies): + return tf_util.identity(input=parameter) diff --git a/tensorforce/core/parameters/piecewise_constant.py b/tensorforce/core/parameters/piecewise_constant.py new file mode 100644 index 000000000..0a418fa38 --- /dev/null +++ b/tensorforce/core/parameters/piecewise_constant.py @@ -0,0 +1,83 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import tf_util +from tensorforce.core.parameters import Parameter + + +class PiecewiseConstant(Parameter): + """ + Piecewise-constant hyperparameter (specification key: `piecewise_constant`). + + Args: + unit ("timesteps" | "episodes" | "updates"): Unit of interval boundaries + (required). + boundaries (iter[long]): Strictly increasing interval boundaries for constant segments + (required). + values (iter[dtype-dependent]): Interval values of constant segments, one more than + (required). + name (string): internal use. + dtype (type): internal use. + min_value (dtype-compatible value): internal use. + max_value (dtype-compatible value): internal use. 
+ """ + + def __init__( + self, *, unit, boundaries, values, name=None, dtype=None, min_value=None, max_value=None + ): + if isinstance(values[0], bool): + if dtype != 'bool': + raise TensorforceError.unexpected() + elif isinstance(values[0], int): + if dtype != 'int': + raise TensorforceError.unexpected() + elif isinstance(values[0], float): + if dtype != 'float': + raise TensorforceError.unexpected() + else: + raise TensorforceError.unexpected() + + assert unit in ('timesteps', 'episodes', 'updates') + assert len(values) == len(boundaries) + 1 + assert boundaries == sorted(boundaries) and boundaries[0] > 0 + assert all(isinstance(value, type(values[0])) for value in values) + + self.boundaries = boundaries + self.values = values + + super().__init__( + unit=unit, name=name, dtype=dtype, min_value=min_value, max_value=max_value + ) + + def min_value(self): + return min(self.values) + + def max_value(self): + return max(self.values) + + def final_value(self): + return self.values[-1] + + def parameter_value(self, *, step): + parameter = tf.keras.optimizers.schedules.PiecewiseConstantDecay( + boundaries=self.boundaries, values=self.values + )(step=step) + + parameter = tf_util.cast(x=parameter, dtype=self.spec.type) + + return parameter diff --git a/tensorforce/core/parameters/random.py b/tensorforce/core/parameters/random.py new file mode 100644 index 000000000..0c1bfc5e2 --- /dev/null +++ b/tensorforce/core/parameters/random.py @@ -0,0 +1,104 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core.parameters import Parameter + + +class Random(Parameter): + """ + Random hyperparameter (specification key: `random`). + + Args: + distribution ("normal" | "uniform"): Distribution type for random hyperparameter value + (required). + kwargs: Additional arguments dependent on distribution type.
+                Normal distribution:
+                    mean (float): Mean (default: 0.0).
+                    stddev (float > 0.0): Standard deviation (default: 1.0).
+                Uniform distribution:
+                    minval (int / float): Lower bound (default: 0 / 0.0).
+                    maxval (float > minval): Upper bound
+                        (default: 1.0 for float, required for int).
                + name (string): internal use. + dtype (type): internal use. + shape (iter[int > 0]): internal use. + min_value (dtype-compatible value): internal use. + max_value (dtype-compatible value): internal use. + """ + + def __init__( + self, *, distribution, name=None, dtype=None, shape=(), min_value=None, max_value=None, + **kwargs + ): + assert dtype in ('int', 'float') + assert distribution in ('normal', 'uniform') + + self.distribution = distribution + self.kwargs = kwargs + + super().__init__( + name=name, dtype=dtype, shape=shape, min_value=min_value, max_value=max_value + ) + + def min_value(self): + if self.distribution == 'uniform': + return self.spec.py_type()(self.kwargs.get('minval', 0)) + + else: + return super().min_value() + + def max_value(self): + if self.distribution == 'uniform': + return self.spec.py_type()(self.kwargs.get('maxval', 1.0)) + + else: + return super().max_value() + + def final_value(self): + if self.distribution == 'normal': + return self.spec.py_type()(self.kwargs.get('mean', 0.0)) + + elif self.distribution == 'uniform': + return self.spec.py_type()( + (self.kwargs.get('maxval', 1.0) + self.kwargs.get('minval', 0.0)) / 2.0 + ) + + else: + return super().final_value() + + def parameter_value(self, *, step): + if self.distribution == 'normal': + parameter = tf.random.normal( + shape=self.spec.shape, dtype=self.spec.tf_type(), mean=self.kwargs.get('mean', 0.0), + stddev=self.kwargs.get('stddev', 1.0) + ) + + elif self.distribution == 'uniform': + parameter = tf.random.uniform( + shape=self.spec.shape, dtype=self.spec.tf_type(), + minval=self.kwargs.get('minval', 0), maxval=self.kwargs.get('maxval') + ) + + return parameter diff --git a/tensorforce/core/policies/__init__.py b/tensorforce/core/policies/__init__.py new file mode 100644 index 000000000..4fba43292 --- /dev/null +++ b/tensorforce/core/policies/__init__.py @@ -0,0 +1,44 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
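The remaining parameter classes follow the same specification pattern. A hedged sketch with hypothetical values, mirroring the constructor arguments of the classes defined above:

```python
# Illustrative only: further hyperparameter specifications matching the classes above.
piecewise_exploration = dict(
    type='piecewise_constant', unit='episodes',
    boundaries=[100, 500],   # strictly increasing interval boundaries
    values=[0.5, 0.1, 0.01]  # one more value than boundaries
)

random_exploration = dict(
    type='random', distribution='uniform', minval=0.0, maxval=0.1
)
```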
+# ============================================================================== + +from tensorforce.core.policies.base_policy import BasePolicy +from tensorforce.core.policies.parametrized_policy import ParametrizedPolicy + +from tensorforce.core.policies.action_value import ActionValue +from tensorforce.core.policies.policy import Policy +from tensorforce.core.policies.state_value import StateValue + +from tensorforce.core.policies.stochastic_policy import StochasticPolicy +from tensorforce.core.policies.value_policy import ValuePolicy + +from tensorforce.core.policies.parametrized_action_value import ParametrizedActionValue +from tensorforce.core.policies.parametrized_distributions import ParametrizedDistributions +from tensorforce.core.policies.parametrized_state_value import ParametrizedStateValue +from tensorforce.core.policies.parametrized_value_policy import ParametrizedValuePolicy + + +policy_modules = dict( + parametrized_action_value=ParametrizedActionValue, + parametrized_distributions=ParametrizedDistributions, + parametrized_state_value=ParametrizedStateValue, + parametrized_value_policy=ParametrizedValuePolicy +) + + +__all__ = [ + 'ActionValue', 'BasePolicy', 'ParametrizedActionValue', 'ParametrizedDistributions', 'ParametrizedPolicy', + 'ParametrizedStateValue', 'ParametrizedValuePolicy', 'Policy', 'StateValue', 'StochasticPolicy', + 'ValuePolicy' +] diff --git a/tensorforce/core/policies/action_value.py b/tensorforce/core/policies/action_value.py new file mode 100644 index 000000000..ef4e64f84 --- /dev/null +++ b/tensorforce/core/policies/action_value.py @@ -0,0 +1,68 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core import SignatureDict, TensorSpec, tf_function +from tensorforce.core.policies import BasePolicy + + +class ActionValue(BasePolicy): + """ + Base class for action-value functions, here categorized as "degenerate" policy. + + Args: + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. 
+ """ + + def __init__( + self, *, device=None, l2_regularization=None, name=None, states_spec=None, + auxiliaries_spec=None, actions_spec=None + ): + BasePolicy.__init__( + self=self, device=device, l2_regularization=l2_regularization, name=name, + states_spec=states_spec, auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec + ) + + def input_signature(self, *, function): + if function == 'action_value': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True) + ) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'action_value': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=True) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=5) + def action_value(self, *, states, horizons, internals, auxiliaries, actions): + raise NotImplementedError diff --git a/tensorforce/core/policies/base_policy.py b/tensorforce/core/policies/base_policy.py new file mode 100644 index 000000000..0b9c8f0ee --- /dev/null +++ b/tensorforce/core/policies/base_policy.py @@ -0,0 +1,92 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core import Module, SignatureDict, TensorDict, TensorSpec, TensorsSpec, tf_function + + +class BasePolicy(Module): + """ + Base class for decision policies and "degenerate" value functions. + + Args: + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. 
+ """ + + def __init__( + self, *, device=None, l2_regularization=None, name=None, states_spec=None, + auxiliaries_spec=None, actions_spec=None + ): + super().__init__(device=device, l2_regularization=l2_regularization, name=name) + + self.states_spec = states_spec + self.auxiliaries_spec = auxiliaries_spec + self.actions_spec = actions_spec + + @property + def internals_spec(self): + return TensorsSpec() + + def internals_init(self): + return TensorDict() + + def max_past_horizon(self, *, on_policy): + raise NotImplementedError + + def input_signature(self, *, function): + if function == 'next_internals': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True), + deterministic=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'past_horizon': + return SignatureDict() + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'next_internals': + return SignatureDict(singleton=self.internals_spec.signature(batched=True)) + + elif function == 'past_horizon': + return SignatureDict( + singleton=TensorSpec(type='int', shape=()).signature(batched=False) + ) + + else: + return super().output_signature(function=function) + + # TODO: should be only required for Policy + def get_savedmodel_trackables(self): + raise NotImplementedError() + + @tf_function(num_args=0) + def past_horizon(self, *, on_policy): + raise NotImplementedError + + @tf_function(num_args=5) + def next_internals(self, *, states, horizons, internals, actions, deterministic, independent): + raise NotImplementedError diff --git a/tensorforce/core/policies/parametrized_action_value.py b/tensorforce/core/policies/parametrized_action_value.py new file mode 100644 index 000000000..f2bcd074c --- /dev/null +++ b/tensorforce/core/policies/parametrized_action_value.py @@ -0,0 +1,109 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core import layer_modules, TensorDict, TensorsSpec, tf_function, tf_util +from tensorforce.core.policies import ActionValue, ParametrizedPolicy + + +class ParametrizedActionValue(ActionValue, ParametrizedPolicy): + """ + Policy which parametrizes an action-value function, conditioned on the output of a neural + network processing the input state (specification key: `parametrized_action_value`). + + Args: + network ('auto' | specification): Policy network configuration, see + [networks](../modules/networks.html) + (default: 'auto', automatically configured + network). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). 
+ name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + internals_spec (specification): internal use. + actions_spec (specification): internal use. + """ + + # Network first + def __init__( + self, network='auto', *, device=None, l2_regularization=None, name=None, states_spec=None, + auxiliaries_spec=None, internals_spec=None, actions_spec=None + ): + super().__init__( + device=device, l2_regularization=l2_regularization, name=name, states_spec=states_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec + ) + + inputs_spec = TensorsSpec() + if self.states_spec.is_singleton(): + inputs_spec['states'] = self.states_spec.singleton() + else: + inputs_spec['states'] = self.states_spec + if self.actions_spec.is_singleton(): + inputs_spec['actions'] = self.actions_spec.singleton() + else: + inputs_spec['actions'] = self.actions_spec + ParametrizedPolicy.__init__(self=self, network=network, inputs_spec=inputs_spec) + output_spec = self.network.output_spec() + + # Action value + self.value = self.submodule( + name='value', module='linear', modules=layer_modules, size=0, input_spec=output_spec + ) + + def get_architecture(self): + return 'Network: {}\nAction-value: {}'.format( + self.network.get_architecture().replace('\n', '\n '), + self.value.get_architecture().replace('\n', '\n ') + ) + + @tf_function(num_args=5) + def next_internals(self, *, states, horizons, internals, actions, deterministic, independent): + inputs = TensorDict() + if self.states_spec.is_singleton(): + inputs['states'] = states.singleton() + else: + inputs['states'] = states + if self.actions_spec.is_singleton(): + inputs['actions'] = actions.singleton() + else: + inputs['actions'] = actions + + return super().next_internals( + states=inputs, horizons=horizons, internals=internals, deterministic=deterministic, + independent=independent + ) + + @tf_function(num_args=5) + def action_value(self, *, states, horizons, internals, auxiliaries, actions): + inputs = TensorDict() + if self.states_spec.is_singleton(): + inputs['states'] = states.singleton() + else: + inputs['states'] = states + if self.actions_spec.is_singleton(): + inputs['actions'] = actions.singleton() + else: + inputs['actions'] = actions + + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=inputs, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + + return self.value.apply(x=embedding) diff --git a/tensorforce/core/policies/parametrized_distributions.py b/tensorforce/core/policies/parametrized_distributions.py new file mode 100644 index 000000000..c5e064b3f --- /dev/null +++ b/tensorforce/core/policies/parametrized_distributions.py @@ -0,0 +1,441 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
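The `parametrized_distributions` policy defined next is typically configured via a specification dict. A hedged sketch with a hypothetical network and values, following the constructor arguments documented below:

```python
# Illustrative only: a policy specification for the 'parametrized_distributions' key.
policy = dict(
    type='parametrized_distributions',
    network=[dict(type='dense', size=64), dict(type='dense', size=64)],
    distributions=dict(float='gaussian'),  # per action-type override
    temperature=dict(  # sampling temperature, here decaying over updates
        type='decaying', decay='exponential', unit='updates', num_steps=1000,
        initial_value=1.0, decay_rate=0.5
    ),
    use_beta_distribution=False
)
```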
+# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import distribution_modules, ModuleDict, TensorDict, TensorsSpec, \ + tf_function, tf_util +from tensorforce.core.policies import ParametrizedPolicy, StochasticPolicy, ValuePolicy + + +class ParametrizedDistributions(StochasticPolicy, ValuePolicy, ParametrizedPolicy): + """ + Policy which parametrizes independent distributions per action, conditioned on the output of a + central neural network processing the input state, supporting both a stochastic and value-based + policy interface (specification key: `parametrized_distributions`). + + Args: + network ('auto' | specification): Policy network configuration, see + [networks](../modules/networks.html) + (default: 'auto', automatically configured + network). + single_output (bool): Whether the network returns a single embedding tensor or, in the case + of multiple action components, specifies additional outputs for some/all action + distributions, via registered tensors with name "[ACTION]-embedding" + (default: single output). + distributions (dict[specification]): Distributions configuration, see + [distributions](../modules/distributions.html), specified per + action-type or -name + (default: per action-type, Bernoulli + distribution for binary boolean actions, categorical distribution for discrete integer + actions, Gaussian distribution for unbounded continuous actions, Beta distribution for + bounded continuous actions). + temperature (parameter | dict[parameter], float >= 0.0): Sampling temperature, global or + per action (default: 1.0). + use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous + actions by default. + (default: false). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + internals_spec (specification): internal use. + actions_spec (specification): internal use. 
+ """ + + # Network first + def __init__( + self, network='auto', *, single_output=True, distributions=None, temperature=1.0, + use_beta_distribution=False, device=None, l2_regularization=None, name=None, + states_spec=None, auxiliaries_spec=None, internals_spec=None, actions_spec=None + ): + super().__init__( + temperature=temperature, device=device, l2_regularization=l2_regularization, name=name, + states_spec=states_spec, auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec + ) + + if single_output: + outputs = None + elif self.actions_spec.is_singleton(): + outputs = ('action-embedding',) + else: + outputs = tuple(name + '-embedding' for name in self.actions_spec) + ParametrizedPolicy.__init__( + self=self, network=network, inputs_spec=self.states_spec, outputs=outputs + ) + output_spec = self.network.output_spec() + if not isinstance(output_spec, TensorsSpec): + output_spec = TensorsSpec(embedding=output_spec) + + # Distributions + self.distributions = ModuleDict() + for name, spec in self.actions_spec.items(): + + if spec.type == 'bool': + default_module = 'bernoulli' + elif spec.type == 'int': + assert spec.num_values is not None + default_module = 'categorical' + elif spec.type == 'float': + if use_beta_distribution and spec.min_value is not None: + default_module = 'beta' + else: + default_module = 'gaussian' + + if distributions is None: + module = None + else: + module = dict() + if name is None and isinstance(distributions, str): + module = distributions + elif name in distributions: + if isinstance(distributions[name], str): + module = distributions[name] + else: + module.update(distributions[name]) + elif spec.type in distributions: + if isinstance(distributions[spec.type], str): + module = distributions[spec.type] + else: + module.update(distributions[spec.type]) + elif name is None and 'type' in distributions: + module.update(distributions) + + if name is None: + self.distributions[name] = self.submodule( + name='action_distribution', module=module, modules=distribution_modules, + default_module=default_module, action_spec=spec, + input_spec=output_spec.get('action-embedding', output_spec['embedding']) + ) + else: + self.distributions[name] = self.submodule( + name=(name + '_distribution'), module=module, modules=distribution_modules, + default_module=default_module, action_spec=spec, + input_spec=output_spec.get(name + '-embedding', output_spec['embedding']) + ) + + self.kldiv_reference_spec = self.distributions.fmap( + function=(lambda x: x.parameters_spec), cls=TensorsSpec + ) + + def get_architecture(self): + architecture = 'Network: {}'.format( + self.network.get_architecture().replace('\n', '\n ') + ) + if self.distributions.is_singleton(): + architecture += '\nAction-distribution:\n {}'.format( + self.distributions.singleton().get_architecture().replace('\n', '\n ') + ) + else: + architecture += '\nAction-distributions:' + for name, distribution in self.distributions.items(): + architecture += '\n {}:\n {}'.format( + name, distribution.get_architecture().replace('\n', '\n ') + ) + return architecture + + def input_signature(self, *, function): + try: + return StochasticPolicy.input_signature(self=self, function=function) + except NotImplementedError: + return ValuePolicy.input_signature(self=self, function=function) + + def output_signature(self, *, function): + try: + return StochasticPolicy.output_signature(self=self, function=function) + except NotImplementedError: + return ValuePolicy.output_signature(self=self, function=function) + + def 
get_savedmodel_trackables(self): + trackables = super().get_savedmodel_trackables() + for distribution in self.distributions.values(): + for variable in distribution.variables: + assert variable.name not in trackables + trackables[variable.name] = variable + return trackables + + @tf_function(num_args=5) + def act(self, *, states, horizons, internals, auxiliaries, deterministic, independent): + assertions = list() + if self.config.create_tf_assertions: + if not independent: + false = tf_util.constant(value=False, dtype='bool') + assertions.append(tf.debugging.assert_equal(x=deterministic, y=false)) + + embedding, internals = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=independent + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + def fn_mode(): + + def function(name, distribution): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + return distribution.mode(parameters=parameters, independent=independent) + + return self.distributions.fmap(function=function, cls=TensorDict, with_names=True) + + def fn_sample(): + if isinstance(self.temperature, dict): + + def function(name, distribution, temp): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + return distribution.sample( + parameters=parameters, temperature=temp, independent=independent + ) + + temperature = self.temperature.fmap(function=(lambda x: x.value()), cls=TensorDict) + return self.distributions.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=(temperature,) + ) + + else: + temperature = self.temperature.value() + + def function(name, distribution): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + return distribution.sample( + parameters=parameters, temperature=temperature, independent=independent + ) + + return self.distributions.fmap(function=function, cls=TensorDict, with_names=True) + + with tf.control_dependencies(control_inputs=assertions): + actions = tf.cond(pred=deterministic, true_fn=fn_mode, false_fn=fn_sample) + return actions, internals + + @tf_function(num_args=5) + def act_entropy(self, *, states, horizons, internals, auxiliaries, deterministic, independent): + assertions = list() + if self.config.create_tf_assertions: + if not independent: + false = tf_util.constant(value=False, dtype='bool') + assertions.append(tf.debugging.assert_equal(x=deterministic, y=false)) + + embedding, internals = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=independent + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + def fn_mode(): + + def function(name, distribution): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + 
'-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + mode = distribution.mode(parameters=parameters, independent=independent) + entropy = distribution.entropy(parameters=parameters) + return mode, entropy + + return self.distributions.fmap(function=function, cls=TensorDict, with_names=True) + + def fn_sample(): + if isinstance(self.temperature, dict): + + def function(name, distribution, temp): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + sample = distribution.sample( + parameters=parameters, temperature=temp, independent=independent + ) + entropy = distribution.entropy(parameters=parameters) + return sample, entropy + + temperature = self.temperature.fmap(function=(lambda x: x.value()), cls=TensorDict) + return self.distributions.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=(temperature,) + ) + + else: + temperature = self.temperature.value() + + def function(name, distribution): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + sample = distribution.sample( + parameters=parameters, temperature=temperature, independent=independent + ) + entropy = distribution.entropy(parameters=parameters) + return sample, entropy + + return self.distributions.fmap(function=function, cls=TensorDict, with_names=True) + + with tf.control_dependencies(control_inputs=assertions): + actions, entropies = tf.cond(pred=deterministic, true_fn=fn_mode, false_fn=fn_sample) + + def function(value, spec): + return tf.reshape(tensor=value, shape=(-1, spec.size)) + + # See also implementation of StochasticPolicy.entropy() + entropies = entropies.fmap(function=function, zip_values=self.actions_spec) + entropies = tf.concat(values=tuple(entropies.values()), axis=1) + entropy = tf.math.reduce_mean(input_tensor=entropies, axis=1) + + return actions, internals, entropy + + @tf_function(num_args=5) + def log_probabilities(self, *, states, horizons, internals, auxiliaries, actions): + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + def function(name, distribution, action): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + return distribution.log_probability(parameters=parameters, action=action) + + return self.distributions.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=actions + ) + + @tf_function(num_args=4) + def entropies(self, *, states, horizons, internals, auxiliaries): + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, 
deterministic=deterministic, + independent=True + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + def function(name, distribution): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + return distribution.entropy(parameters=parameters) + + return self.distributions.fmap(function=function, cls=TensorDict, with_names=True) + + @tf_function(num_args=5) + def kl_divergences(self, *, states, horizons, internals, auxiliaries, reference): + parameters = self.kldiv_reference( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + reference = reference.fmap(function=tf.stop_gradient) + + def function(distribution, parameters1, parameters2): + return distribution.kl_divergence(parameters1=parameters1, parameters2=parameters2) + + return self.distributions.fmap( + function=function, cls=TensorDict, zip_values=(parameters, reference) + ) + + @tf_function(num_args=4) + def kldiv_reference(self, *, states, horizons, internals, auxiliaries): + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + def function(name, distribution): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + return distribution.parametrize(x=x, conditions=conditions) + + return self.distributions.fmap(function=function, cls=TensorDict, with_names=True) + + @tf_function(num_args=5) + def action_values(self, *, states, horizons, internals, auxiliaries, actions): + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + def function(name, distribution, action): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, conditions=conditions) + return distribution.action_value(parameters=parameters, action=action) + + return self.distributions.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=actions + ) + + @tf_function(num_args=4) + def state_values(self, *, states, horizons, internals, auxiliaries): + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + def function(name, distribution): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + conditions = auxiliaries.get(name, default=TensorDict()) + parameters = distribution.parametrize(x=x, 
conditions=conditions) + return distribution.state_value(parameters=parameters) + + return self.distributions.fmap(function=function, cls=TensorDict, with_names=True) diff --git a/tensorforce/core/policies/parametrized_policy.py b/tensorforce/core/policies/parametrized_policy.py new file mode 100644 index 000000000..b9bf528f7 --- /dev/null +++ b/tensorforce/core/policies/parametrized_policy.py @@ -0,0 +1,84 @@ +# Copyright 2021 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import network_modules, TensorsSpec, tf_function +from tensorforce.core.policies import BasePolicy + + +class ParametrizedPolicy(BasePolicy): + """ + Base class for parametrized ("degenerate") policies. + """ + + # Network first + def __init__(self, network='auto', *, inputs_spec, outputs=None): + # Assumed to be secondary base class, so super() constructor has already been called + assert hasattr(self, 'name') + + # Network + if isinstance(network, tf.keras.Model) or ( + isinstance(network, type) and issubclass(network, tf.keras.Model) + ): + network = dict(type='keras', model=network) + self.network = self.submodule( + name='network', module=network, modules=network_modules, inputs_spec=inputs_spec, + outputs=outputs + ) + output_spec = self.network.output_spec() + if isinstance(output_spec, TensorsSpec): + for name, spec in output_spec.items(): + if spec.type != 'float': + raise TensorforceError.type( + name='ParametrizedPolicy', argument='network {} output'.format(name), + dtype=spec.type + ) + else: + if output_spec.type != 'float': + raise TensorforceError.type( + name='ParametrizedPolicy', argument='network output', dtype=output_spec.type + ) + + @property + def internals_spec(self): + return self.network.internals_spec + + def internals_init(self): + return self.network.internals_init() + + def max_past_horizon(self, *, on_policy): + return self.network.max_past_horizon(on_policy=on_policy) + + def get_savedmodel_trackables(self): + trackables = dict() + for variable in self.network.variables: + assert variable.name not in trackables + trackables[variable.name] = variable + return trackables + + @tf_function(num_args=0) + def past_horizon(self, *, on_policy): + return self.network.past_horizon(on_policy=on_policy) + + @tf_function(num_args=5) + def next_internals(self, *, states, horizons, internals, actions, deterministic, independent): + _, internals = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=independent + ) + + return internals diff --git a/tensorforce/core/policies/parametrized_state_value.py b/tensorforce/core/policies/parametrized_state_value.py new file mode 100644 index 000000000..404741674 --- /dev/null +++ b/tensorforce/core/policies/parametrized_state_value.py @@ -0,0 +1,73 @@ +# Copyright 2020 Tensorforce Team. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core import layer_modules, tf_function, tf_util +from tensorforce.core.policies import ParametrizedPolicy, StateValue + + +class ParametrizedStateValue(StateValue, ParametrizedPolicy): + """ + Policy which parametrizes a state-value function, conditioned on the output of a neural network + processing the input state (specification key: `parametrized_state_value`). + + Args: + network ('auto' | specification): Policy network configuration, see + [networks](../modules/networks.html) + (default: 'auto', automatically configured + network). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + internals_spec (specification): internal use. + actions_spec (specification): internal use. + """ + + # Network first + def __init__( + self, network='auto', *, device=None, l2_regularization=None, name=None, states_spec=None, + auxiliaries_spec=None, internals_spec=None, actions_spec=None + ): + super().__init__( + device=device, l2_regularization=l2_regularization, name=name, states_spec=states_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec + ) + + ParametrizedPolicy.__init__(self=self, network=network, inputs_spec=self.states_spec) + output_spec = self.network.output_spec() + + # State value + self.value = self.submodule( + name='value', module='linear', modules=layer_modules, size=0, input_spec=output_spec + ) + + def get_architecture(self): + return 'Network: {}\nState-value: {}'.format( + self.network.get_architecture().replace('\n', '\n '), + self.value.get_architecture().replace('\n', '\n ') + ) + + @tf_function(num_args=4) + def state_value(self, *, states, horizons, internals, auxiliaries): + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + + return self.value.apply(x=embedding) diff --git a/tensorforce/core/policies/parametrized_value_policy.py b/tensorforce/core/policies/parametrized_value_policy.py new file mode 100644 index 000000000..5efcf1751 --- /dev/null +++ b/tensorforce/core/policies/parametrized_value_policy.py @@ -0,0 +1,593 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError +from tensorforce.core import layer_modules, ModuleDict, TensorDict, TensorSpec, TensorsSpec, \ + tf_function, tf_util +from tensorforce.core.policies import ParametrizedPolicy, ValuePolicy + + +class ParametrizedValuePolicy(ValuePolicy, ParametrizedPolicy): + """ + Policy which parametrizes independent action-/advantage-/state-value functions per action and + optionally a state-value function, conditioned on the output of a central neural network + processing the input state (specification key: `parametrized_value_policy`). + + Args: + network ('auto' | specification): Policy network configuration, see + [networks](../modules/networks.html) + (default: 'auto', automatically configured + network). + single_output (bool): Whether the network returns a single embedding tensor or, in the case + of multiple action components, specifies additional outputs for some/all action/state + value functions, via registered tensors with name "[ACTION]-embedding" or + "state-embedding"/"[ACTION]-state-embedding" depending on the state_value_mode argument + (default: single output). + state_value_mode ('implicit' | 'separate' | 'separate-per-action'): Whether to compute the + state value implicitly as maximum action value (like DQN), or as either a single + separate state-value function or a function per action (like DuelingDQN) + (default: single separate state-value + function). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + internals_spec (specification): internal use. + actions_spec (specification): internal use. 
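As with the stochastic policy earlier in this diff, a rough sketch of a corresponding specification dict may help; it is an assumption for illustration only, and the comments merely restate the three `state_value_mode` options documented above.

```python
# Hypothetical specification for a `parametrized_value_policy` (illustration only).
policy_spec = dict(
    type='parametrized_value_policy',
    network='auto',
    single_output=True,
    # 'implicit': state value taken as the maximum action value (DQN-style);
    # 'separate': one shared state-value head (DuelingDQN-style, the default);
    # 'separate-per-action': an individual state-value head per action component.
    state_value_mode='separate',
)
```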
+ """ + + # Network first + def __init__( + self, network='auto', *, single_output=True, state_value_mode='separate', device=None, + l2_regularization=None, name=None, states_spec=None, auxiliaries_spec=None, + internals_spec=None, actions_spec=None + ): + super().__init__( + device=device, l2_regularization=l2_regularization, name=name, states_spec=states_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec + ) + + if not all(spec.type in ('bool', 'int') for spec in self.actions_spec.values()): + raise TensorforceError.value( + name='ParametrizedValuePolicy', argument='actions_spec', value=actions_spec, + hint='types not bool/int' + ) + + # State value mode + if state_value_mode not in ('implicit', 'separate', 'separate-per-action'): + raise TensorforceError.value( + name='ParametrizedValuePolicy', argument='state_value_mode', value=state_value_mode, + hint='not from {implicit,separate,separate-per-action}' + ) + self.state_value_mode = state_value_mode + + if single_output: + outputs = None + elif self.actions_spec.is_singleton(): + if self.state_value_mode == 'implicit': + outputs = ('action-embedding',) + else: + outputs = ('action-embedding', 'state-embedding') + else: + outputs = tuple(name + '-embedding' for name in self.actions_spec) + if self.state_value_mode == 'separate': + outputs += ('state-embedding',) + elif self.state_value_mode == 'separate-per-action': + outputs += tuple(name + '-state-embedding' for name in self.actions_spec) + ParametrizedPolicy.__init__( + self=self, network=network, inputs_spec=self.states_spec, outputs=outputs + ) + output_spec = self.network.output_spec() + if not isinstance(output_spec, TensorsSpec): + output_spec = TensorsSpec(embedding=output_spec) + + # Action values + def function(name, spec): + if name is None: + input_name = 'action-embedding' + name = 'action_value' + else: + input_name = name + '-embedding' + name = name + '_action_value' + if spec.type == 'bool': + return self.submodule( + name=name, module='linear', modules=layer_modules, size=(spec.size * 2), + input_spec=output_spec.get(input_name, output_spec['embedding']) + ) + elif spec.type == 'int': + return self.submodule( + name=name, module='linear', modules=layer_modules, + size=(spec.size * spec.num_values), + input_spec=output_spec.get(input_name, output_spec['embedding']) + ) + + self.a_values = self.actions_spec.fmap(function=function, cls=ModuleDict, with_names=True) + + if self.state_value_mode == 'separate': + # State value + self.s_value = self.submodule( + name='value', module='linear', modules=layer_modules, size=0, + input_spec=output_spec.get('state-embedding', output_spec['embedding']) + ) + + elif self.state_value_mode == 'separate-per-action': + # State values per action + + def function(name, spec): + if name is None: + input_name = 'state-embedding' + name = 'state_value' + else: + input_name = name + '-state-embedding' + name = name + '_state_value' + return self.submodule( + name=name, module='linear', modules=layer_modules, size=spec.size, + input_spec=output_spec.get(input_name, output_spec['embedding']) + ) + + self.s_values = self.states_spec.fmap( + function=function, cls=ModuleDict, with_names=True + ) + + def get_architecture(self): + architecture = 'Network: {}'.format( + self.network.get_architecture().replace('\n', '\n ') + ) + if self.a_values.is_singleton(): + architecture += 'Action-value: {}'.format( + self.a_values.singleton().get_architecture().replace('\n', '\n ') + ) + else: + architecture += 'Action-values:' + for name, 
a_value in self.a_values.items(): + architecture += '\n {}: {}'.format( + name, a_value.get_architecture().replace('\n', '\n ') + ) + if self.state_value_mode == 'separate': + architecture += 'State-value: {}'.format( + self.s_value.get_architecture().replace('\n', '\n ') + ) + elif self.state_value_mode == 'separate-per-action': + if self.s_values.is_singleton(): + architecture += 'State-value: {}'.format( + self.s_values.singleton().get_architecture().replace('\n', '\n ') + ) + else: + architecture += 'State-values:' + for name, s_value in self.s_values.items(): + architecture += '\n {}: {}'.format( + name, s_value.get_architecture().replace('\n', '\n ') + ) + return architecture + + def initialize(self): + super().initialize() + + for name, spec in self.actions_spec.items(): + if spec.type == 'bool': + if name is None: + names = ['action-values/true', 'action-values/false'] + else: + names = ['action-values/' + name + '-true', 'action-values/' + name + '-false'] + spec = TensorSpec(type='float', shape=(spec.shape + (2,))) + else: + if name is None: + prefix = 'action-values/action' + else: + prefix = 'action-values/' + name + '-action' + names = [prefix + str(n) for n in range(spec.num_values)] + spec = TensorSpec(type='float', shape=(spec.shape + (spec.num_values,))) + + self.register_summary(label='action-value', name=names) + + if name is None: + name = 'action-values' + else: + name = name + '-values' + + self.register_tracking(label='action-value', name=name, spec=spec) + + def get_savedmodel_trackables(self): + trackables = super().get_savedmodel_trackables() + for a_value in self.a_values.values(): + for variable in a_value.variables: + assert variable.name not in trackables + trackables[variable.name] = variable + if self.state_value_mode == 'separate': + for variable in self.s_value.variables: + assert variable.name not in trackables + trackables[variable.name] = variable + elif self.state_value_mode == 'separate-per-action': + for s_value in self.s_values.values(): + for variable in s_value.variables: + assert variable.name not in trackables + trackables[variable.name] = variable + return trackables + + @tf_function(num_args=5) + def act(self, *, states, horizons, internals, auxiliaries, deterministic, independent): + embedding, internals = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=independent + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + if self.state_value_mode == 'implicit': + + def function(name, spec, a_value): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + action_value = a_value.apply(x=x) + if spec.type == 'bool': + shape = (-1,) + spec.shape + (2,) + elif spec.type == 'int': + shape = (-1,) + spec.shape + (spec.num_values,) + return tf.reshape(tensor=action_value, shape=shape) + + action_values = self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,) + ) + + elif self.state_value_mode == 'separate': + state_value = self.s_value.apply( + x=embedding.get('state-embedding', embedding['embedding']) + ) + + def function(name, spec, a_value): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + advantage_value = a_value.apply(x=x) + if spec.type == 'bool': + shape = (-1,) + spec.shape + 
(2,) + elif spec.type == 'int': + shape = (-1,) + spec.shape + (spec.num_values,) + advantage_value = tf.reshape(tensor=advantage_value, shape=shape) + mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True) + shape = (-1,) + tuple(1 for _ in range(spec.rank + 1)) + _state_value = tf.reshape(tensor=state_value, shape=shape) + return _state_value + (advantage_value - mean) + + action_values = self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,) + ) + + elif self.state_value_mode == 'separate-per-action': + + def function(name, spec, s_value, a_value): + if name is None: + state_value = s_value.apply( + x=embedding.get('state-embedding', embedding['embedding']) + ) + advantage_value = a_value.apply( + x=embedding.get('action-embedding', embedding['embedding']) + ) + else: + state_value = s_value.apply( + x=embedding.get(name + '-state-embedding', embedding['embedding']) + ) + advantage_value = a_value.apply( + x=embedding.get(name + '-embedding', embedding['embedding']) + ) + if spec.type == 'bool': + shape = (-1,) + spec.shape + (2,) + elif spec.type == 'int': + shape = (-1,) + spec.shape + (spec.num_values,) + advantage_value = tf.reshape(tensor=advantage_value, shape=shape) + mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True) + return tf.expand_dims(input=state_value, axis=-1) + (advantage_value - mean) + + action_values = self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, + zip_values=(self.s_values, self.a_values) + ) + + def function(name, spec, action_value): + if spec.type == 'bool': + + def fn_summary(): + axis = range(spec.rank + 1) + values = tf.math.reduce_mean(input_tensor=action_value, axis=axis) + return [values[0], values[1]] + + if name is None: + names = ['action-values/true', 'action-values/false'] + else: + names = ['action-values/' + name + '-true', 'action-values/' + name + '-false'] + dependencies = self.summary( + label='action-value', name=names, data=fn_summary, step='timesteps' + ) + + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=action_value, axis=0) + + if name is None: + n = 'action-values' + else: + n = name + '-values' + dependencies = self.track(label='action-value', name=n, data=fn_tracking) + + with tf.control_dependencies(control_inputs=dependencies): + return (action_value[..., 0] > action_value[..., 1]) + + elif spec.type == 'int': + + def fn_summary(): + axis = range(spec.rank + 1) + values = tf.math.reduce_mean(input_tensor=action_value, axis=axis) + return [values[n] for n in range(spec.num_values)] + + if name is None: + prefix = 'action-values/action' + else: + prefix = 'action-values/' + name + '-action' + names = [prefix + str(n) for n in range(spec.num_values)] + dependencies = self.summary( + label='action-value', name=names, data=fn_summary, step='timesteps' + ) + + def fn_tracking(): + return tf.math.reduce_mean(input_tensor=action_value, axis=0) + + if name is None: + n = 'action-values' + else: + n = name + '-values' + dependencies = self.track(label='action-value', name=n, data=fn_tracking) + + with tf.control_dependencies(control_inputs=dependencies): + if self.config.enable_int_action_masking: + mask = auxiliaries[name]['mask'] + min_float = tf_util.get_dtype(type='float').min + min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float) + action_value = tf.where(condition=mask, x=action_value, y=min_float) + return tf.math.argmax(input=action_value, axis=-1, 
output_type=spec.tf_type()) + + actions = self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=(action_values,) + ) + + return actions, internals + + @tf_function(num_args=4) + def state_value(self, *, states, horizons, internals, auxiliaries): + if self.state_value_mode == 'separate': + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + return self.s_value.apply(x=embedding.get('state-embedding', embedding['embedding'])) + + else: + return super().state_value( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + + @tf_function(num_args=5) + def action_values(self, *, states, horizons, internals, auxiliaries, actions): + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + if self.state_value_mode == 'implicit': + + def function(name, spec, a_value, action): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + action_value = a_value.apply(x=x) + if spec.type == 'bool': + shape = (-1,) + spec.shape + (2,) + elif spec.type == 'int': + shape = (-1,) + spec.shape + (spec.num_values,) + action_value = tf.reshape(tensor=action_value, shape=shape) + if spec.type == 'bool': + return tf.where( + condition=action, x=action_value[..., 0], y=action_value[..., 1] + ) + elif spec.type == 'int': + action = tf.expand_dims(input=action, axis=(spec.rank + 1)) + action_value = tf.gather( + params=action_value, indices=action, batch_dims=(spec.rank + 1) + ) + return tf.squeeze(input=action_value, axis=(spec.rank + 1)) + + return self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, + zip_values=(self.a_values, actions) + ) + + elif self.state_value_mode == 'separate': + state_value = self.s_value.apply( + x=embedding.get('state-embedding', embedding['embedding']) + ) + + def function(name, spec, a_value, action): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + advantage_value = a_value.apply(x=x) + if spec.type == 'bool': + shape = (-1,) + spec.shape + (2,) + elif spec.type == 'int': + shape = (-1,) + spec.shape + (spec.num_values,) + advantage_value = tf.reshape(tensor=advantage_value, shape=shape) + mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True) + shape = (-1,) + tuple(1 for _ in range(spec.rank + 1)) + _state_value = tf.reshape(tensor=state_value, shape=shape) + action_value = _state_value + (advantage_value - mean) + if spec.type == 'bool': + return tf.where( + condition=action, x=action_value[..., 0], y=action_value[..., 1] + ) + elif spec.type == 'int': + action = tf.expand_dims(input=action, axis=(spec.rank + 1)) + action_value = tf.gather( + params=action_value, indices=action, batch_dims=(spec.rank + 1) + ) + return tf.squeeze(input=action_value, axis=(spec.rank + 1)) + + return self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, + zip_values=(self.a_values, 
actions) + ) + + elif self.state_value_mode == 'separate-per-action': + + def function(name, spec, s_value, a_value, action): + if name is None: + state_value = s_value.apply( + x=embedding.get('state-embedding', embedding['embedding']) + ) + advantage_value = a_value.apply( + x=embedding.get('action-embedding', embedding['embedding']) + ) + else: + state_value = s_value.apply( + x=embedding.get(name + '-state-embedding', embedding['embedding']) + ) + advantage_value = a_value.apply( + x=embedding.get(name + '-embedding', embedding['embedding']) + ) + if spec.type == 'bool': + shape = (-1,) + spec.shape + (2,) + elif spec.type == 'int': + shape = (-1,) + spec.shape + (spec.num_values,) + advantage_value = tf.reshape(tensor=advantage_value, shape=shape) + mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True) + action_value = tf.expand_dims(input=state_value, axis=-1) + (advantage_value - mean) + if spec.type == 'bool': + return tf.where( + condition=action, x=action_value[..., 0], y=action_value[..., 1] + ) + elif spec.type == 'int': + action = tf.expand_dims(input=action, axis=(spec.rank + 1)) + action_value = tf.gather( + params=action_value, indices=action, batch_dims=(spec.rank + 1) + ) + return tf.squeeze(input=action_value, axis=(spec.rank + 1)) + + return self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, + zip_values=(self.s_values, self.a_values, actions) + ) + + @tf_function(num_args=4) + def state_values(self, *, states, horizons, internals, auxiliaries): + deterministic = tf_util.constant(value=True, dtype='bool') + embedding, _ = self.network.apply( + x=states, horizons=horizons, internals=internals, deterministic=deterministic, + independent=True + ) + if not isinstance(embedding, TensorDict): + embedding = TensorDict(embedding=embedding) + + if self.state_value_mode == 'implicit': + + def function(name, spec, a_value): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + action_value = a_value.apply(x=x) + if spec.type == 'bool': + shape = (-1,) + spec.shape + (2,) + elif spec.type == 'int': + shape = (-1,) + spec.shape + (spec.num_values,) + action_value = tf.reshape(tensor=action_value, shape=shape) + if spec.type == 'bool': + return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1]) + elif spec.type == 'int': + if self.config.enable_int_action_masking: + mask = auxiliaries[name]['mask'] + min_float = tf_util.get_dtype(type='float').min + min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float) + action_value = tf.where(condition=mask, x=action_value, y=min_float) + return tf.math.reduce_max(input_tensor=action_value, axis=-1) + + return self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,) + ) + + elif self.state_value_mode == 'separate': + state_value = self.s_value.apply( + x=embedding.get('state-embedding', embedding['embedding']) + ) + + def function(name, spec, a_value): + if name is None: + x = embedding.get('action-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-embedding', embedding['embedding']) + advantage_value = a_value.apply(x=x) + if spec.type == 'bool': + shape = (-1,) + spec.shape + (2,) + elif spec.type == 'int': + shape = (-1,) + spec.shape + (spec.num_values,) + advantage_value = tf.reshape(tensor=advantage_value, shape=shape) + mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, 
keepdims=True) + shape = (-1,) + tuple(1 for _ in range(spec.rank + 1)) + _state_value = tf.reshape(tensor=state_value, shape=shape) + action_value = _state_value + (advantage_value - mean) + if spec.type == 'bool': + return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1]) + elif spec.type == 'int': + if self.config.enable_int_action_masking: + mask = auxiliaries[name]['mask'] + min_float = tf_util.get_dtype(type='float').min + min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float) + action_value = tf.where(condition=mask, x=action_value, y=min_float) + return tf.math.reduce_max(input_tensor=action_value, axis=-1) + + return self.actions_spec.fmap( + function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,) + ) + + elif self.state_value_mode == 'separate-per-action': + + def function(name, spec, s_value): + if name is None: + x = embedding.get('state-embedding', embedding['embedding']) + else: + x = embedding.get(name + '-state-embedding', embedding['embedding']) + state_value = s_value.apply(x=x) + if spec.type == 'bool': + shape = (-1,) + spec.shape + elif spec.type == 'int': + shape = (-1,) + spec.shape + return tf.reshape(tensor=state_value, shape=shape) + + return self.actions_spec.fmap( + function=function, cls=TensorDict, zip_values=(self.s_values,) + ) diff --git a/tensorforce/core/policies/policy.py b/tensorforce/core/policies/policy.py new file mode 100644 index 000000000..cc718570e --- /dev/null +++ b/tensorforce/core/policies/policy.py @@ -0,0 +1,63 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core import SignatureDict, TensorSpec, tf_function +from tensorforce.core.policies import BasePolicy + + +class Policy(BasePolicy): + """ + Base class for decision policies. + + Args: + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. 
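To make the contract of this base class concrete, here is a minimal hypothetical subclass skeleton (not part of the diff); it only restates the two hooks defined below, and the imports assume the same module layout as the surrounding files.

```python
from tensorforce.core import tf_function
from tensorforce.core.policies import Policy


# Hypothetical skeleton of a concrete decision policy (illustration only).
class MyPolicy(Policy):

    def get_architecture(self):
        # Human-readable summary of the policy's submodules.
        return 'MyPolicy'

    @tf_function(num_args=5)
    def act(self, *, states, horizons, internals, auxiliaries, deterministic, independent):
        # Must return (actions, internals) matching actions_spec / internals_spec,
        # as declared by output_signature(function='act') below.
        raise NotImplementedError
```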
+ """ + + def get_architecture(self): + raise NotImplementedError + + def input_signature(self, *, function): + if function == 'act': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + deterministic=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'act': + return SignatureDict( + actions=self.actions_spec.signature(batched=True), + internals=self.internals_spec.signature(batched=True) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=5) + def act(self, *, states, horizons, internals, auxiliaries, deterministic, independent): + raise NotImplementedError diff --git a/tensorforce/core/policies/state_value.py b/tensorforce/core/policies/state_value.py new file mode 100644 index 000000000..3f2776d33 --- /dev/null +++ b/tensorforce/core/policies/state_value.py @@ -0,0 +1,67 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.core import SignatureDict, TensorSpec, tf_function +from tensorforce.core.policies import BasePolicy + + +class StateValue(BasePolicy): + """ + Base class for state-value functions, here categorized as "degenerate" policy. + + Args: + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. 
+ """ + + def __init__( + self, *, device=None, l2_regularization=None, name=None, states_spec=None, + auxiliaries_spec=None, actions_spec=None + ): + BasePolicy.__init__( + self=self, device=device, l2_regularization=l2_regularization, name=name, + states_spec=states_spec, auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec + ) + + def input_signature(self, *, function): + if function == 'state_value': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True) + ) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'state_value': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=True) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=4) + def state_value(self, *, states, horizons, internals, auxiliaries): + raise NotImplementedError diff --git a/tensorforce/core/policies/stochastic_policy.py b/tensorforce/core/policies/stochastic_policy.py new file mode 100644 index 000000000..1a77a0a0c --- /dev/null +++ b/tensorforce/core/policies/stochastic_policy.py @@ -0,0 +1,272 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import ModuleDict, parameter_modules, SignatureDict, TensorDict, TensorSpec, \ + TensorsSpec, tf_function, tf_util +from tensorforce.core.policies import Policy + + +class StochasticPolicy(Policy): + """ + Base class for stochastic policies. + + Args: + temperature (parameter | dict[parameter], float >= 0.0): Sampling temperature, global or + per action (default: 1.0). + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. + kldiv_reference_spec (specification): + internal use. 
+ """ + + def __init__( + self, *, temperature=1.0, device=None, l2_regularization=None, name=None, states_spec=None, + auxiliaries_spec=None, internals_spec=None, actions_spec=None, kldiv_reference_spec=None + ): + super().__init__( + device=device, l2_regularization=l2_regularization, name=name, states_spec=states_spec, + auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec + ) + + self.kldiv_reference_spec = kldiv_reference_spec + + # Sampling temperature + if temperature is None: + temperature = 1.0 + if isinstance(temperature, dict) and all(name in self.actions_spec for name in temperature): + # Different temperature per action + + def function(name, spec): + return self.submodule( + name=(name + '_temperature'), module=temperature.get(name, 0.0), + modules=parameter_modules, is_trainable=False, dtype='float', min_value=0.0 + ) + + self.temperature = self.actions_spec.fmap( + function=function, cls=ModuleDict, with_names=True + ) + + else: + # Same temperature for all actions + self.temperature = self.submodule( + name='temperature', module=temperature, modules=parameter_modules, + is_trainable=False, dtype='float', min_value=0.0 + ) + + def input_signature(self, *, function): + if function == 'act_entropy': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + deterministic=TensorSpec(type='bool', shape=()).signature(batched=False) + ) + + elif function == 'entropy': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True) + ) + + elif function == 'entropies': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True) + ) + + elif function == 'kl_divergence': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + reference=self.distributions.fmap( + function=(lambda x: x.parameters_spec), cls=TensorsSpec + ).signature(batched=True) + ) + + elif function == 'kl_divergences': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + reference=self.distributions.fmap( + function=(lambda x: x.parameters_spec), cls=TensorsSpec + ).signature(batched=True) + ) + + elif function == 'kldiv_reference': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True) + ) + + elif function == 'log_probability': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + 
internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True) + ) + + elif function == 'log_probabilities': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True) + ) + + else: + return super().input_signature(function=function) + + def output_signature(self, *, function): + if function == 'act_entropy': + return SignatureDict( + actions=self.actions_spec.signature(batched=True), + internals=self.internals_spec.signature(batched=True), + entropy=TensorSpec(type='float', shape=()).signature(batched=True) + ) + + elif function == 'entropy': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=True) + ) + + elif function == 'entropies': + return SignatureDict( + singleton=self.actions_spec.fmap(function=( + lambda spec: TensorSpec(type='float', shape=spec.shape).signature(batched=True) + ), cls=SignatureDict) + ) + + elif function == 'kl_divergence': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=True) + ) + + elif function == 'kl_divergences': + return SignatureDict( + singleton=self.actions_spec.fmap(function=( + lambda spec: TensorSpec(type='float', shape=spec.shape).signature(batched=True) + ), cls=SignatureDict) + ) + + elif function == 'kldiv_reference': + return SignatureDict( + singleton=self.kldiv_reference_spec.signature(batched=True) + ) + + elif function == 'log_probability': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=True) + ) + + elif function == 'log_probabilities': + return SignatureDict( + singleton=self.actions_spec.fmap(function=( + lambda spec: TensorSpec(type='float', shape=spec.shape).signature(batched=True) + ), cls=SignatureDict) + ) + + else: + return super().output_signature(function=function) + + @tf_function(num_args=5) + def log_probability(self, *, states, horizons, internals, auxiliaries, actions): + log_probabilities = self.log_probabilities( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions + ) + + def function(value, spec): + return tf.reshape(tensor=value, shape=(-1, spec.size)) + + log_probabilities = log_probabilities.fmap(function=function, zip_values=self.actions_spec) + log_probabilities = tf.concat(values=tuple(log_probabilities.values()), axis=1) + + return tf.math.reduce_sum(input_tensor=log_probabilities, axis=1) + + @tf_function(num_args=4) + def entropy(self, *, states, horizons, internals, auxiliaries): + entropies = self.entropies( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + + def function(value, spec): + return tf.reshape(tensor=value, shape=(-1, spec.size)) + + # See also implementation of ParametrizedDistributions.act_entropy() + entropies = entropies.fmap(function=function, zip_values=self.actions_spec) + entropies = tf.concat(values=tuple(entropies.values()), axis=1) + + return tf.math.reduce_mean(input_tensor=entropies, axis=1) + + @tf_function(num_args=5) + def kl_divergence(self, *, states, horizons, internals, auxiliaries, reference): + kl_divergences = self.kl_divergences( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + 
reference=reference + ) + + def function(value, spec): + return tf.reshape(tensor=value, shape=(-1, spec.size)) + + kl_divergences = kl_divergences.fmap(function=function, zip_values=self.actions_spec) + kl_divergences = tf.concat(values=tuple(kl_divergences.values()), axis=1) + + return tf.math.reduce_mean(input_tensor=kl_divergences, axis=1) + + @tf_function(num_args=5) + def act(self, *, states, horizons, internals, auxiliaries, deterministic, independent): + raise NotImplementedError + + @tf_function(num_args=5) + def act_entropy(self, *, states, horizons, internals, auxiliaries, deterministic, independent): + raise NotImplementedError + + @tf_function(num_args=4) + def entropies(self, *, states, horizons, internals, auxiliaries): + raise NotImplementedError + + @tf_function(num_args=5) + def kl_divergences(self, *, states, horizons, internals, auxiliaries, reference): + raise NotImplementedError + + @tf_function(num_args=4) + def kldiv_reference(self, *, states, horizons, internals, auxiliaries): + raise NotImplementedError + + @tf_function(num_args=5) + def log_probabilities(self, *, states, horizons, internals, auxiliaries, actions): + raise NotImplementedError diff --git a/tensorforce/core/policies/value_policy.py b/tensorforce/core/policies/value_policy.py new file mode 100644 index 000000000..43b4a87a2 --- /dev/null +++ b/tensorforce/core/policies/value_policy.py @@ -0,0 +1,146 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce.core import SignatureDict, TensorSpec, tf_function +from tensorforce.core.policies import ActionValue, Policy, StateValue + + +class ValuePolicy(Policy, StateValue, ActionValue): + """ + Base class for value-based policies. + + Args: + device (string): Device name + (default: inherit value of parent module). + l2_regularization (float >= 0.0): Scalar controlling L2 regularization + (default: inherit value of parent module). + name (string): internal use. + states_spec (specification): internal use. + auxiliaries_spec (specification): internal use. + actions_spec (specification): internal use. 
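The scalar `action_value`/`state_value` reductions implemented below flatten each per-action value tensor to shape `(batch, size)`, concatenate along the last axis, and average. The toy snippet below replays that reduction on dummy tensors to make the shape handling explicit; the shapes are arbitrary examples.

```python
import tensorflow as tf

# Toy replay of the aggregation used by action_value()/state_value() below:
# two action components, flattened per batch entry, concatenated, then averaged.
batch = 3
values_a = tf.random.normal(shape=(batch, 2, 2))  # action 'a', shape (2, 2), size 4
values_b = tf.random.normal(shape=(batch, 4))     # action 'b', shape (4,), size 4

flat = [tf.reshape(values_a, shape=(-1, 4)), tf.reshape(values_b, shape=(-1, 4))]
combined = tf.concat(values=flat, axis=1)           # shape (batch, 8)
aggregated = tf.math.reduce_mean(combined, axis=1)  # shape (batch,)
```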
+ """ + + def __init__( + self, *, device=None, l2_regularization=None, name=None, states_spec=None, + auxiliaries_spec=None, actions_spec=None + ): + Policy.__init__( + self=self, device=device, l2_regularization=l2_regularization, name=name, + states_spec=states_spec, auxiliaries_spec=auxiliaries_spec, actions_spec=actions_spec + ) + + def input_signature(self, *, function): + if function == 'action_value': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True) + ) + + elif function == 'action_values': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True), + actions=self.actions_spec.signature(batched=True) + ) + + elif function == 'state_values': + return SignatureDict( + states=self.states_spec.signature(batched=True), + horizons=TensorSpec(type='int', shape=(2,)).signature(batched=True), + internals=self.internals_spec.signature(batched=True), + auxiliaries=self.auxiliaries_spec.signature(batched=True) + ) + + else: + try: + return Policy.input_signature(self=self, function=function) + except NotImplementedError: + try: + return StateValue.input_signature(self=self, function=function) + except NotImplementedError: + return ActionValue.input_signature(self=self, function=function) + + def output_signature(self, *, function): + if function == 'action_value': + return SignatureDict( + singleton=TensorSpec(type='float', shape=()).signature(batched=True) + ) + + if function == 'action_values': + return SignatureDict( + singleton=self.actions_spec.fmap(function=( + lambda spec: TensorSpec(type='float', shape=spec.shape).signature(batched=True) + ), cls=SignatureDict) + ) + + elif function == 'state_values': + return SignatureDict( + singleton=self.actions_spec.fmap(function=( + lambda spec: TensorSpec(type='float', shape=spec.shape).signature(batched=True) + ), cls=SignatureDict) + ) + + else: + try: + return Policy.output_signature(self=self, function=function) + except NotImplementedError: + try: + return StateValue.output_signature(self=self, function=function) + except NotImplementedError: + return ActionValue.output_signature(self=self, function=function) + + @tf_function(num_args=5) + def action_value(self, *, states, horizons, internals, auxiliaries, actions): + action_values = self.action_values( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries, + actions=actions + ) + + def function(value, spec): + return tf.reshape(tensor=value, shape=(-1, spec.size)) + + action_values = action_values.fmap(function=function, zip_values=self.actions_spec) + action_values = tf.concat(values=tuple(action_values.values()), axis=1) + + return tf.math.reduce_mean(input_tensor=action_values, axis=1) + + @tf_function(num_args=4) + def state_value(self, *, states, horizons, internals, auxiliaries): + state_values = self.state_values( + states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries + ) + + def function(value, spec): + return tf.reshape(tensor=value, shape=(-1, spec.size)) + + state_values = state_values.fmap(function=function, zip_values=self.actions_spec) + state_values = 
tf.concat(values=tuple(state_values.values()), axis=1) + + return tf.math.reduce_mean(input_tensor=state_values, axis=1) + + @tf_function(num_args=5) + def action_values(self, *, states, horizons, internals, auxiliaries, actions): + raise NotImplementedError + + @tf_function(num_args=4) + def state_values(self, *, states, horizons, internals, auxiliaries): + raise NotImplementedError diff --git a/tensorforce/core/preprocessors/__init__.py b/tensorforce/core/preprocessors/__init__.py deleted file mode 100644 index 2c2bbcf79..000000000 --- a/tensorforce/core/preprocessors/__init__.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from tensorforce.core.preprocessors.preprocessor import Preprocessor, PreprocessorStack -from tensorforce.core.preprocessors.sequence import Sequence -from tensorforce.core.preprocessors.standardize import Standardize -from tensorforce.core.preprocessors.running_standardize import RunningStandardize -from tensorforce.core.preprocessors.normalize import Normalize -from tensorforce.core.preprocessors.grayscale import Grayscale -from tensorforce.core.preprocessors.image_resize import ImageResize -from tensorforce.core.preprocessors.divide import Divide -from tensorforce.core.preprocessors.clip import Clip -from tensorforce.core.preprocessors.flatten import Flatten - - -preprocessors = dict( - sequence=Sequence, - standardize=Standardize, - running_standardize=RunningStandardize, - normalize=Normalize, - grayscale=Grayscale, - image_resize=ImageResize, - divide=Divide, - clip=Clip, - flatten=Flatten -) - - -__all__ = [ - 'preprocessors', - 'Preprocessor', - 'PreprocessorStack', - 'Sequence', - 'Standardize', - 'RunningStandardize', - 'Normalize', - 'Grayscale', - 'ImageResize', - 'Divide', - 'Clip', - 'Flatten' -] diff --git a/tensorforce/core/preprocessors/clip.py b/tensorforce/core/preprocessors/clip.py deleted file mode 100644 index bbcc845bc..000000000 --- a/tensorforce/core/preprocessors/clip.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorforce.core.preprocessors import Preprocessor - - -class Clip(Preprocessor): - """ - Clip by min/max. - """ - - def __init__(self, shape, min_value, max_value, scope='clip', summary_labels=()): - self.min_value = min_value - self.max_value = max_value - super(Clip, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_process(self, tensor): - return tf.clip_by_value(t=tensor, clip_value_min=self.min_value, clip_value_max=self.max_value) diff --git a/tensorforce/core/preprocessors/divide.py b/tensorforce/core/preprocessors/divide.py deleted file mode 100644 index 0db980709..000000000 --- a/tensorforce/core/preprocessors/divide.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorforce.core.preprocessors import Preprocessor - - -class Divide(Preprocessor): - """ - Divide state by scale. - """ - - def __init__(self, shape, scale, scope='divide', summary_labels=()): - self.scale = scale - super(Divide, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_process(self, tensor): - return tensor / self.scale diff --git a/tensorforce/core/preprocessors/flatten.py b/tensorforce/core/preprocessors/flatten.py deleted file mode 100644 index 849b7c5fb..000000000 --- a/tensorforce/core/preprocessors/flatten.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.preprocessors import Preprocessor - - -class Flatten(Preprocessor): - """ - Normalize state. Subtract minimal value and divide by range. 
- """ - - def __init__(self, scope='flatten', summary_labels=()): - super(Flatten, self).__init__(scope=scope, summary_labels=summary_labels) - - def processed_shape(self, shape): - return -1, util.prod(shape[1:]) - - def tf_process(self, tensor): - # Flatten tensor - return tf.reshape(tensor=tensor, shape=self.processed_shape(util.shape(tensor))) diff --git a/tensorforce/core/preprocessors/grayscale.py b/tensorforce/core/preprocessors/grayscale.py deleted file mode 100644 index 79d9c804c..000000000 --- a/tensorforce/core/preprocessors/grayscale.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.preprocessors import Preprocessor - - -class Grayscale(Preprocessor): - """ - Turn 3D color state into grayscale. - """ - - def __init__(self, shape, weights=(0.299, 0.587, 0.114), remove_rank=False, scope='grayscale', summary_labels=()): - """ - Args: - weights (tuple): The weights to multiply each color channel with (in order: red, blue, green). - remove_rank (bool): If True, will remove the color channel rank from the input tensor. - """ - self.weights = weights - self.remove_rank = remove_rank - super(Grayscale, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_process(self, tensor): - weights = tf.reshape(tensor=self.weights, shape=(tuple(1 for _ in range(util.rank(tensor) - 1)) + (3,))) - weighted_sum = tf.reduce_sum(input_tensor=(weights * tensor), axis=-1, keepdims=(not self.remove_rank)) - return weighted_sum - - def processed_shape(self, shape): - return tuple(shape[:-1]) + ((1,) if not self.remove_rank else ()) diff --git a/tensorforce/core/preprocessors/image_resize.py b/tensorforce/core/preprocessors/image_resize.py deleted file mode 100644 index ebf53df77..000000000 --- a/tensorforce/core/preprocessors/image_resize.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorforce.core.preprocessors import Preprocessor - - -class ImageResize(Preprocessor): - """ - Resize image to width x height. - """ - - def __init__(self, shape, width, height, scope='image_resize', summary_labels=()): - self.size = (width, height) - super(ImageResize, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_process(self, tensor): - return tf.image.resize_images(images=tensor, size=self.size) - - def processed_shape(self, shape): - return self.size + (shape[-1],) diff --git a/tensorforce/core/preprocessors/normalize.py b/tensorforce/core/preprocessors/normalize.py deleted file mode 100644 index 2261ac915..000000000 --- a/tensorforce/core/preprocessors/normalize.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.preprocessors import Preprocessor - - -class Normalize(Preprocessor): - """ - Normalize state. Subtract minimal value and divide by range. - """ - - def __init__(self, shape, scope='normalize', summary_labels=()): - super(Normalize, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_process(self, tensor): - # Min/max across every axis except batch dimension. - min_value = tensor - max_value = tensor - for axis in range(1, util.rank(tensor)): - min_value = tf.reduce_min(input_tensor=min_value, axis=axis, keep_dims=True) - max_value = tf.reduce_max(input_tensor=max_value, axis=axis, keep_dims=True) - - return (tensor - min_value) / (max_value - min_value + util.epsilon) diff --git a/tensorforce/core/preprocessors/preprocessor.py b/tensorforce/core/preprocessors/preprocessor.py deleted file mode 100644 index 962a4507e..000000000 --- a/tensorforce/core/preprocessors/preprocessor.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf -import copy - -from tensorforce import util -import tensorforce.core.preprocessors - - -class Preprocessor(object): - """ - A Preprocessor is an object used to map input state signals to some RL-model - to some "preprocessed state signals". For example: If the input state is an RGB image of 84x84px (3 color - channels; 84x84x3 tensor), a preprocessor could make this image a grayscale 84x84x1 tensor, instead. - - Each preprocessor is fully integrated into the model's graph, has its own scope and owns some - variables that live under that scope in the graph. - """ - - def __init__(self, shape, scope='preprocessor', summary_labels=None): - self.shape = shape - self.summary_labels = set(summary_labels or ()) - self.variables = dict() - self.summaries = list() - - def custom_getter(getter, name, registered=False, **kwargs): - variable = getter(name=name, registered=True, **kwargs) - if not registered: - self.variables[name] = variable - return variable - - self.process = tf.make_template( - name_=(scope + '/process'), - func_=self.tf_process, - custom_getter_=custom_getter - ) - self.reset = tf.make_template( - name_=(scope + '/reset'), - func_=self.tf_reset, - custom_getter_=custom_getter - ) - - def tf_reset(self): - """ - Resets this preprocessor to some initial state. This method is called whenever an episode ends. - This could be useful if the preprocessor stores certain episode-sequence information to do the processing - and this information has to be reset after the episode terminates. - """ - pass - - def tf_process(self, tensor): - """ - Process state (tensor). - - Args: - tensor (tf.Tensor): The Tensor to process. - - Returns: The pre-processed Tensor. - """ - return tensor - - def processed_shape(self, shape): - """ - Shape of preprocessed state given original shape. - - Args: - shape (tuple): The original (unprocessed) shape. - - Returns: The processed tensor shape. - """ - return shape - - def get_variables(self): - """ - Returns the TensorFlow variables used by the preprocessor. - - Returns: - List of variables. - """ - return [self.variables[key] for key in sorted(self.variables)] - - -class PreprocessorStack(object): - """ - A class to handle many Preprocessor objects applied in a sequence to some state. For example: An image - tensor as state signal could be re-sized first, then grayscaled, then normalized. - """ - - def __init__(self): - self.preprocessors = list() - - def reset(self): - """ - Calls `reset` on all our Preprocessor objects. - - Returns: - A list of tensors to be fetched. - """ - fetches = [] - for processor in self.preprocessors: - fetches.extend(processor.reset() or []) - return fetches - - def process(self, tensor): - """ - Process state. - - Args: - tensor: tensor to process - - Returns: processed state - - """ - for processor in self.preprocessors: - tensor = processor.process(tensor=tensor) - return tensor - - def processed_shape(self, shape): - """ - Shape of preprocessed state given original shape. 
- - Args: - shape: original state shape - - Returns: processed state shape - """ - for processor in self.preprocessors: - shape = processor.processed_shape(shape=shape) - return shape - - def get_variables(self): - return [variable for preprocessor in self.preprocessors for variable in preprocessor.get_variables()] - - @staticmethod - def from_spec(spec, kwargs=None): - """ - Creates a preprocessing stack from a specification dict. - """ - if isinstance(spec, dict): - spec = [spec] - - stack = PreprocessorStack() - for preprocessor_spec in spec: - # need to deep copy, otherwise will add first processors spec_ to kwargs to second processor - preprocessor_kwargs = copy.deepcopy(kwargs) - preprocessor = util.get_object( - obj=preprocessor_spec, - predefined_objects=tensorforce.core.preprocessors.preprocessors, - kwargs=preprocessor_kwargs - ) - assert isinstance(preprocessor, Preprocessor) - stack.preprocessors.append(preprocessor) - - return stack diff --git a/tensorforce/core/preprocessors/running_standardize.py b/tensorforce/core/preprocessors/running_standardize.py deleted file mode 100644 index 35b1c85e8..000000000 --- a/tensorforce/core/preprocessors/running_standardize.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorforce import util -from tensorforce.core.preprocessors import Preprocessor - - -class RunningStandardize(Preprocessor): - """ - Standardize state w.r.t past states. - Subtract mean and divide by standard deviation of sequence of past states. - Based on https://www.johndcook.com/blog/standard_deviation/. - """ - - def __init__( - self, - shape, - reset_after_batch=True, - scope='running_standardize', - summary_labels=() - ): - self.reset_after_batch = reset_after_batch - # The op that resets our stats variables. 
- self.reset_op = None - super(RunningStandardize, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_reset(self): - if self.reset_after_batch: - return [self.reset_op] - - def tf_process(self, tensor): - count = tf.get_variable( - name='count', - dtype=util.tf_dtype('float'), - initializer=0.0, - trainable=False - ) - mean_estimate = tf.get_variable( - name='mean-estimate', - shape=self.shape, - dtype=util.tf_dtype('float'), - initializer=tf.zeros_initializer(), - trainable=False - ) - variance_sum_estimate = tf.get_variable( - name='variance-sum-estimate', - shape=self.shape, - dtype=util.tf_dtype('float'), - initializer=tf.zeros_initializer(), - trainable=False - ) - self.reset_op = tf.variables_initializer([count, mean_estimate, variance_sum_estimate], name='reset-op') - - assignment = tf.assign_add(ref=count, value=1.0) - - with tf.control_dependencies(control_inputs=(assignment,)): - # Mean update - mean = tf.reduce_sum(input_tensor=(tensor - mean_estimate), axis=0) # reduce_mean? - assignment = tf.assign_add(ref=mean_estimate, value=(mean / count)) - - with tf.control_dependencies(control_inputs=(assignment,)): - - def first_run(): - # No meaningful mean and variance yet. - return tensor - - def later_run(): - # Variance update - variance = tf.reduce_sum(input_tensor=((tensor - mean_estimate) * mean), axis=0) # reduce_mean? - assignment = tf.assign_add(ref=variance_sum_estimate, value=variance) - with tf.control_dependencies(control_inputs=(assignment,)): - variance_estimate = variance_sum_estimate / (count - 1.0) - # Standardize tensor - return (tensor - mean_estimate) / tf.maximum(x=tf.sqrt(x=variance_estimate), y=util.epsilon) - - return tf.cond(pred=(count > 1.0), true_fn=later_run, false_fn=first_run) diff --git a/tensorforce/core/preprocessors/sequence.py b/tensorforce/core/preprocessors/sequence.py deleted file mode 100755 index 17989c28f..000000000 --- a/tensorforce/core/preprocessors/sequence.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.preprocessors import Preprocessor - - -class Sequence(Preprocessor): - """ - Concatenate `length` state vectors. Example: Used in Atari - problems to create the Markov property (velocity of game objects as they move across the screen). - """ - - def __init__(self, shape, length=2, add_rank=False, scope='sequence', summary_labels=()): - """ - Args: - length (int): The number of states to concatenate. In the beginning, when no previous state is available, - concatenate the given first state with itself `length` times. - add_rank (bool): Whether to add another rank to the end of the input with dim=length-of-the-sequence. - This could be useful if e.g. 
a grayscale image of w x h pixels is coming from the env - (no color channel). The output of the preprocessor would then be of shape [batch] x w x h x [length]. - """ - # raise TensorForceError("The sequence preprocessor is temporarily broken; use version 0.3.2 if required.") - self.length = length - self.add_rank = add_rank - # The op that resets index back to -1. - self.reset_op = None - super(Sequence, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_reset(self): - return [self.reset_op] - - def tf_process(self, tensor): - # or just always the same? - tf.assert_equal(x=tf.shape(input=tensor)[0], y=1) - - states_buffer = tf.get_variable( - name='states-buffer', - shape=((self.length,) + util.shape(tensor)[1:]), - dtype=tensor.dtype, - trainable=False - ) - index = tf.get_variable( - name='index', - dtype=util.tf_dtype('int'), - initializer=-1, - trainable=False - ) - self.reset_op = tf.variables_initializer([index], name='reset-op') - - def first_run(): - fill_buffer = (self.length,) + tuple(1 for _ in range(util.rank(tensor) - 1)) - return tf.assign(ref=states_buffer, value=tf.tile(input=tensor, multiples=fill_buffer)) - - def later_run(): - return tf.assign(ref=states_buffer[index], value=tensor[0]) - - assignment = tf.cond(pred=(index >= 0), true_fn=later_run, false_fn=first_run) - - with tf.control_dependencies(control_inputs=(assignment,)): - previous_states = [states_buffer[(index - n - 1) % self.length] for n in range(self.length)] - assignment = tf.assign(ref=index, value=((tf.maximum(x=index, y=0) + 1) % self.length)) - - with tf.control_dependencies(control_inputs=(assignment,)): - if self.add_rank: - stack = tf.stack(values=previous_states, axis=-1) - else: - stack = tf.concat(values=previous_states, axis=-1) - batch_one = tf.expand_dims(input=stack, axis=0) - return batch_one - - def processed_shape(self, shape): - if self.add_rank: - return shape + (self.length,) - else: - return shape[:-1] + (shape[-1] * self.length,) diff --git a/tensorforce/core/preprocessors/standardize.py b/tensorforce/core/preprocessors/standardize.py deleted file mode 100644 index b193f1297..000000000 --- a/tensorforce/core/preprocessors/standardize.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.preprocessors import Preprocessor - - -class Standardize(Preprocessor): - """ - Standardize state. Subtract mean and divide by standard deviation. 
- """ - - def __init__( - self, - shape, - across_batch=False, - scope='standardize', - summary_labels=() - ): - self.across_batch = across_batch - super(Standardize, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels) - - def tf_process(self, tensor): - if self.across_batch: - axes = tuple(range(util.rank(tensor))) - else: - axes = tuple(range(1, util.rank(tensor))) - - mean, variance = tf.nn.moments(x=tensor, axes=axes, keep_dims=True) - return (tensor - mean) / tf.maximum(x=tf.sqrt(variance), y=util.epsilon) diff --git a/tensorforce/core/baselines/__init__.py b/tensorforce/core/utils/__init__.py old mode 100755 new mode 100644 similarity index 50% rename from tensorforce/core/baselines/__init__.py rename to tensorforce/core/utils/__init__.py index ae6cff785..38152b81e --- a/tensorforce/core/baselines/__init__.py +++ b/tensorforce/core/utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,26 +13,19 @@ # limitations under the License. # ============================================================================== -from tensorforce.core.baselines.baseline import Baseline -from tensorforce.core.baselines.aggregated_baseline import AggregatedBaseline -from tensorforce.core.baselines.network_baseline import NetworkBaseline -from tensorforce.core.baselines.mlp_baseline import MLPBaseline -from tensorforce.core.baselines.cnn_baseline import CNNBaseline +from tensorforce.core.utils import tf_util +from tensorforce.core.utils.nested_dict import NestedDict +from tensorforce.core.utils.tensor_spec import TensorSpec +# Requires NestedDict +from tensorforce.core.utils.dicts import ArrayDict, ListDict, ModuleDict, SignatureDict, \ + TensorDict, VariableDict -baselines = dict( - aggregated=AggregatedBaseline, - custom=NetworkBaseline, - mlp=MLPBaseline, - cnn=CNNBaseline -) +# Requires TensorsDict (and TensorSpec) +from tensorforce.core.utils.tensors_spec import TensorsSpec __all__ = [ - 'baselines', - 'Baseline', - 'AggregatedBaseline', - 'NetworkBaseline', - 'MLPBaseline', - 'CNNBaseline' + 'ArrayDict', 'ListDict', 'ModuleDict', 'NestedDict', 'SignatureDict', 'TensorDict', + 'TensorSpec', 'TensorsSpec', 'tf_util', 'VariableDict' ] diff --git a/tensorforce/core/utils/dicts.py b/tensorforce/core/utils/dicts.py new file mode 100644 index 000000000..7dab1f844 --- /dev/null +++ b/tensorforce/core/utils/dicts.py @@ -0,0 +1,316 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
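+#
+# The classes defined below are thin specializations of NestedDict: ArrayDict
+# (NumPy arrays), ListDict (lists), ModuleDict (Tensorforce modules, trackable via the
+# TensorFlow AutoTrackable mechanism), SignatureDict (tf.TensorSpec signatures, with
+# helpers to convert between positional args and kwargs), TensorDict (TensorFlow
+# tensors/variables) and VariableDict (tf.Variable, trackable).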
+# ============================================================================== + +from collections import OrderedDict + +import numpy as np +import tensorflow as tf +from tensorflow.python.training.tracking.tracking import AutoTrackable +from tensorflow.python.training.tracking.data_structures import sticky_attribute_assignment + +from tensorforce.core.utils import NestedDict + + +class TrackableNestedDict(NestedDict, AutoTrackable): + + def __init__(self, arg=None, *, value_type=None, overwrite=None, **kwargs): + self._maybe_initialize_trackable() + super().__init__( + arg=arg, value_type=value_type, overwrite=overwrite, singleton=None, **kwargs + ) + + def __setattr__(self, name, value): + super(NestedDict, self).__setattr__(name, value) + + def __setitem__(self, key, value): + if key is None: + value = sticky_attribute_assignment( + trackable=self, value=value, name=self.__class__._SINGLETON + ) + else: + value = sticky_attribute_assignment(trackable=self, value=value, name=key) + super().__setitem__(key, value) + + # def __iter__(self): + # for name in super().__iter__(): + # if name is None: + # yield + # else: + # yield name + + # def items(self): + # for name, value in super().items(): + # if name is None: + # yield self.__class__._SINGLETON, value + # else: + # yield name, value + + +class ArrayDict(NestedDict): + + def __init__(self, *args, singleton=None, **kwargs): + super().__init__( + *args, value_type=np.ndarray, overwrite=False, singleton=singleton, **kwargs + ) + + def __setitem__(self, key, value): + if not isinstance(value, dict): + value = np.asarray(value) + super().__setitem__(key, value) + + def to_dict(self): + if self.is_singleton(): + value = self.singleton() + if isinstance(value, self.value_type): + return value + else: + return value.to_dict() + else: + return OrderedDict(( + (name, arg) if isinstance(arg, self.value_type) else (name, arg.to_dict()) + for name, arg in super(NestedDict, self).items() + )) + + def to_kwargs(self): + if self.is_singleton(): + value = self.singleton() + if isinstance(value, self.value_type): + return value + else: + return value.to_kwargs() + else: + return OrderedDict(((name, arg) for name, arg in super(NestedDict, self).items())) + + +class ListDict(NestedDict): + + def __init__(self, *args, **kwargs): + super().__init__(*args, value_type=list, overwrite=False, singleton=None, **kwargs) + + +class ModuleDict(TrackableNestedDict): + + def __init__(self, *args, **kwargs): + from tensorforce.core import Module + super().__init__(*args, value_type=Module, overwrite=False, **kwargs) + + +class SignatureDict(NestedDict): + + def __init__(self, *args, singleton=None, **kwargs): + super().__init__( + *args, value_type=tf.TensorSpec, overwrite=False, singleton=singleton, **kwargs + ) + + def __setitem__(self, key, value): + super().__setitem__(key, value) + if key is None or key == self.__class__._SINGLETON or self.is_singleton() or '/' in key or \ + not isinstance(value, self.value_type): + pass + elif value._name is None: + value._name = key + else: + assert value._name == key + + def num_args(self): + return super(NestedDict, self).__len__() + + def to_list(self, to_dict=False): + if self.is_singleton(): + spec = self.singleton() + if isinstance(spec, self.value_type): + return spec + else: + return spec.to_list() + + else: + return [ + spec if isinstance(spec, self.value_type) else ( + spec.to_dict() if to_dict else spec.to_list() + ) for spec in super(NestedDict, self).values() + if isinstance(spec, self.value_type) or len(spec) > 
0 + ] + + def to_dict(self): + if self.is_singleton(): + spec = self.singleton() + if isinstance(spec, self.value_type): + return spec + else: + return spec.to_dict() + + else: + return OrderedDict(( + (name, (spec if isinstance(spec, self.value_type) else spec.to_dict())) + for name, spec in super(NestedDict, self).items() + if isinstance(spec, self.value_type) or len(spec) > 0 + )) + + def kwargs_to_args(self, *, kwargs, to_dict=False, outer_tuple=False, is_outer=True): + if self.is_singleton(): + spec = self.singleton() + if isinstance(spec, self.value_type): + if isinstance(kwargs, (tf.IndexedSlices, tf.Tensor, tf.Variable)): + # Special case: API input arguments are raw values, not singleton dicts + assert spec.is_compatible_with(spec_or_tensor=kwargs), (spec, kwargs) + return kwargs + else: + assert isinstance(kwargs, TensorDict) and kwargs.is_singleton() + arg = kwargs.singleton() + assert isinstance(arg, (tf.IndexedSlices, tf.Tensor, tf.Variable)) + assert spec.is_compatible_with(spec_or_tensor=arg), (spec, arg) + return arg + else: + return spec.kwargs_to_args(kwargs=kwargs, to_dict=to_dict, is_outer=False) + + else: + if is_outer: + assert isinstance(kwargs, (dict, list, tuple)) + else: + assert isinstance(kwargs, TensorDict), (self, kwargs) + if to_dict: + args = dict() + else: + args = list() + for index, (name, spec) in enumerate(super(NestedDict, self).items()): + if is_outer and isinstance(kwargs, (list, tuple)): + if index < len(kwargs): + arg = kwargs[index] + elif name in kwargs: + arg = kwargs[name] + if isinstance(spec, self.value_type): + assert isinstance(arg, (tf.IndexedSlices, tf.Tensor, tf.Variable)) + if isinstance(arg, tf.IndexedSlices): + # TODO: why does IndexedSlicesSpec not work? + # spec = tf.IndexedSlicesSpec( + # shape=spec.shape, dtype=spec.dtype, indices_dtype=arg.indices.dtype + # ) + # assert spec.is_compatible_with(spec_or_value=arg), (name, spec, arg) + assert tf.TensorSpec( + shape=((None,) + spec.shape[1:]), dtype=spec.dtype + ).is_compatible_with(spec_or_tensor=arg.values) + assert tf.TensorSpec( + shape=(None,), dtype=arg.indices.dtype + ).is_compatible_with(spec_or_tensor=arg.indices) + else: + assert spec.is_compatible_with(spec_or_tensor=arg), (name, spec, arg) + if to_dict: + args[name] = arg + else: + args.append(arg) + elif len(spec) == 0: + continue + else: + arg = spec.kwargs_to_args(kwargs=arg, to_dict=to_dict, is_outer=False) + if to_dict: + args[name] = arg + else: + args.append(arg) + if to_dict: + if outer_tuple and is_outer: + args = tuple(args.values()) + else: + args = tuple(args) + return args + + def args_to_kwargs(self, *, args, from_dict=False, outer_tuple=False, is_outer=True): + if self.is_singleton(): + spec = self.singleton() + if isinstance(spec, self.value_type): + assert isinstance(args, (tf.IndexedSlices, tf.Tensor, tf.Variable)), (self, args) + assert spec.is_compatible_with(spec_or_tensor=args), (spec, args) + kwargs = args + else: + kwargs = spec.args_to_kwargs(args=args, from_dict=from_dict, is_outer=False) + if outer_tuple and is_outer: + return kwargs + else: + return TensorDict(singleton=kwargs) + + else: + if is_outer: + assert isinstance(args, (dict, list, tuple)) + elif from_dict: + assert isinstance(args, dict), (self, args) + else: + assert isinstance(args, (list, tuple)), (self, args) + kwargs = TensorDict() + index = 0 + for name, spec in super(NestedDict, self).items(): + if from_dict and isinstance(args, dict): + arg = args.get(name) + elif index < len(args): + assert not from_dict or is_outer + arg = 
args[index] + else: + arg = None + if isinstance(spec, self.value_type): + assert isinstance(arg, (tf.IndexedSlices, tf.Tensor, tf.Variable)) + if isinstance(arg, tf.IndexedSlices): + # TODO: why does IndexedSlicesSpec not work? + # spec = tf.IndexedSlicesSpec( + # shape=spec.shape, dtype=spec.dtype, indices_dtype=arg.indices.dtype + # ) + # assert spec.is_compatible_with(spec_or_value=arg), (name, spec, arg) + assert tf.TensorSpec( + shape=((None,) + spec.shape[1:]), dtype=spec.dtype + ).is_compatible_with(spec_or_tensor=arg.values) + assert tf.TensorSpec( + shape=(None,), dtype=arg.indices.dtype + ).is_compatible_with(spec_or_tensor=arg.indices) + else: + assert spec.is_compatible_with(spec_or_tensor=arg), (name, spec, arg) + kwargs[name] = arg + index += 1 + elif len(spec) == 0: + # Recover empty arguments + # (False incompatible with TensorDict, so ensures it is never called) + kwargs[name] = spec.fmap(function=(lambda: False), cls=TensorDict) + else: + kwargs[name] = spec.args_to_kwargs( + args=arg, from_dict=from_dict, is_outer=False + ) + index += 1 + if outer_tuple and is_outer: + return tuple(super(NestedDict, kwargs).values()) + else: + return kwargs + + + class TensorDict(NestedDict): + + def __init__(self, *args, overwrite=True, singleton=None, **kwargs): + # TensorSpec required for SavedModel (presumably for spec tracing) + super().__init__( + *args, value_type=( + tf.IndexedSlices, tf.IndexedSlicesSpec, tf.Tensor, tf.TensorSpec, tf.Variable + ), overwrite=overwrite, singleton=singleton, **kwargs + ) + + def to_kwargs(self): + if self.is_singleton(): + value = self.singleton() + if isinstance(value, self.value_type): + return value + else: + return value.to_kwargs() + else: + return OrderedDict(((name, arg) for name, arg in super(NestedDict, self).items())) + + + class VariableDict(TrackableNestedDict): + + def __init__(self, *args, **kwargs): + super().__init__(*args, value_type=tf.Variable, overwrite=False, **kwargs) diff --git a/tensorforce/core/utils/nested_dict.py b/tensorforce/core/utils/nested_dict.py new file mode 100644 index 000000000..b8939775f --- /dev/null +++ b/tensorforce/core/utils/nested_dict.py @@ -0,0 +1,418 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
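+#
+# NestedDict (defined below) maps string keys to values of a fixed `value_type`, with
+# '/' separating nesting levels: for example, `d['a/b'] = v` stores `v` under sub-key
+# 'b' of the nested dict 'a', creating intermediate levels as needed, and `d['a/b']`
+# retrieves it again. A single unnamed value can be held via the reserved singleton key.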
+# ============================================================================== + +from collections import OrderedDict + +from tensorforce import TensorforceError + + +def _is_keyword(x): + return x in {'bool', 'int', 'float', 'type', 'shape', 'min_value', 'max_value', 'num_values'} + + +class NestedDict(OrderedDict): + + _SINGLETON = 'SINGLETON' + + def __init__(self, arg=None, *, value_type=None, overwrite=None, singleton=None, **kwargs): + super().__init__() + super().__setattr__('value_type', value_type) + super().__setattr__('overwrite', overwrite) + if singleton is not None: + if arg is not None or len(kwargs) > 0: + raise TensorforceError.invalid(name='NestedDict', argument='singleton') + self[None] = singleton + elif arg is None: + self.update(**kwargs) + else: + self.update(arg, **kwargs) + + def __eq__(self, other): + return type(self) is type(other) and len(self) == len(other) and \ + all(value == other.get(key) for key, value in super().items()) + + def copy(self): + if self.is_singleton(): + x = self.__class__() + value = self.singleton() + x[None] = value.copy() if hasattr(value, 'copy') else value + else: + x = self.__class__(( + (name, (value.copy() if hasattr(value, 'copy') else value)) + for name, value in super().items() + )) + super(NestedDict, x).__setattr__('value_type', self.value_type) + super(NestedDict, x).__setattr__('overwrite', self.overwrite) + return x + + def flatten(self): + return list(self.values()) + + def zip_items(self, *others): + assert all(len(other) == len(self) for other in others) + for name, value in self.items(): + assert all(name in other for other in others) + other_values = tuple(other[name] for other in others) + yield (name, value) + other_values + + def fmap(self, *, function, cls=None, with_names=False, zip_values=None): + if cls is None: + # Use same class and settings for mapped dict + values = self.__class__() + super(NestedDict, values).__setattr__('value_type', self.value_type) + super(NestedDict, values).__setattr__('overwrite', self.overwrite) + setitem = values.__setitem__ + elif issubclass(cls, list): + # Special target class list implies flatten + values = cls() + setitem = (lambda n, v: (values.extend(v) if isinstance(v, cls) else values.append(v))) + elif issubclass(cls, dict): + # Custom target class + values = cls() + setitem = values.__setitem__ + else: + raise TensorforceError.value(name='NestedDict.fmap', argument='cls', value=cls) + + for name, value in super().items(): + if name == self.__class__._SINGLETON: + name = None + + if isinstance(with_names, str): + if name is None: + full_name = with_names + else: + full_name = '{}/{}'.format(with_names, name) + else: + assert isinstance(with_names, bool) + if with_names: + if name is None and not isinstance(value, self.value_type): + full_name = True + else: + full_name = name + else: + full_name = False + + if isinstance(zip_values, (tuple, list)): + zip_value = tuple(xs[name] for xs in zip_values) + elif isinstance(zip_values, NestedDict): + zip_value = (zip_values[name],) + elif zip_values is None: + zip_value = None + else: + raise TensorforceError.type( + name='NestedDict.fmap', argument='zip_values', dtype=type(zip_values) + ) + + if isinstance(value, self.value_type): + if with_names: + args = (full_name, value) + else: + args = (value,) + if zip_value is not None: + args += zip_value + # args += tuple( + # x.singleton() if isinstance(x, NestedDict) else x for x in zip_value + # ) + setitem(name, function(*args)) + else: + setitem(name, value.fmap( + 
function=function, cls=cls, with_names=full_name, zip_values=zip_value + )) + + return values + + def is_singleton(self): + return super().__len__() == 1 and super().__contains__(self.__class__._SINGLETON) + + def singleton(self): + assert self.is_singleton() + return super().__getitem__(self.__class__._SINGLETON) + + def __len__(self): + return sum( + 1 if isinstance(value, self.value_type) else len(value) for value in super().values() + ) + + def __iter__(self): + for name, value in super().items(): + if name == self.__class__._SINGLETON: + if isinstance(value, self.value_type): + yield None + else: + yield from value + elif isinstance(value, self.value_type): + yield name + else: + assert isinstance(value, self.__class__) + for subname in value: + if subname is None: + yield name + else: + yield '{}/{}'.format(name, subname) + + def items(self): + for name, value in super().items(): + if name == self.__class__._SINGLETON: + if isinstance(value, self.value_type): + yield None, value + else: + yield from value.items() + elif isinstance(value, self.value_type): + yield name, value + else: + assert isinstance(value, self.__class__) + for subname, subvalue in value.items(): + if subname is None: + yield name, subvalue + else: + yield '{}/{}'.format(name, subname), subvalue + + def values(self): + for value in super().values(): + if isinstance(value, self.value_type): + yield value + else: + assert isinstance(value, self.__class__) + yield from value.values() + + def __contains__(self, item): + if item is None or item == self.__class__._SINGLETON: + assert super().__len__() == 0 or self.is_singleton() + return super().__contains__(self.__class__._SINGLETON) + + elif isinstance(item, (list, tuple)): + for name in item: + if name not in self: + return False + return True + + elif not isinstance(item, str): + raise TensorforceError.type(name='NestedDict', argument='key', dtype=type(item)) + + elif item.startswith(self.__class__._SINGLETON + '/'): + raise TensorforceError.value(name='NestedDict', argument='item', value=item) + + elif self.is_singleton(): + value = self.singleton() + if isinstance(value, self.value_type): + return False + else: + return item in value + + elif '/' in item: + item, subitem = item.split('/', 1) + if super().__contains__(item): + value = super().__getitem__(item) + assert isinstance(value, self.__class__) + return subitem in value + else: + return False + + else: + return super().__contains__(item) + + def __getitem__(self, key): + if key is None or key == self.__class__._SINGLETON: + assert self.is_singleton() + return super().__getitem__(self.__class__._SINGLETON) + + elif isinstance(key, (int, slice)): + return self.fmap(function=(lambda x: x[key])) + + elif isinstance(key, (list, tuple)): + return self.__class__(((name, self[name]) for name in key)) + + elif not isinstance(key, str): + raise TensorforceError.type(name='NestedDict', argument='key', dtype=type(key)) + + elif key.startswith(self.__class__._SINGLETON + '/'): + raise TensorforceError.value(name='NestedDict', argument='key', value=key) + + elif self.is_singleton(): + return self.singleton()[key] + + elif '/' in key: + key, subkey = key.split('/', 1) + value = super().__getitem__(key) + assert isinstance(value, self.__class__) + return value[subkey] + + else: + return super().__getitem__(key) + + def __setitem__(self, key, value): + if isinstance(value, dict) and not isinstance(value, self.value_type): + if isinstance(value, self.__class__): + value = value.copy() + else: + value = self.__class__(value) 
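+ # At this point plain dict values have been wrapped in the same NestedDict subclass;
+ # any remaining value must be an instance of value_type, otherwise it is rejected below.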
+ if not isinstance(value, self.__class__) and not isinstance(value, self.value_type): + raise TensorforceError.type(name='NestedDict', argument='value', dtype=type(value)) + + if key is None or key == self.__class__._SINGLETON: + assert super().__len__() == 0 or self.is_singleton() + super().__setitem__(self.__class__._SINGLETON, value) + + elif not isinstance(key, str): + raise TensorforceError.type(name='NestedDict', argument='key', dtype=type(key)) + + elif key.startswith(self.__class__._SINGLETON + '/'): + raise TensorforceError.value(name='NestedDict', argument='key', value=key) + + elif self.is_singleton(): + self.singleton()[key] = value + + elif '/' in key: + subvalue = value + key, subkey = key.split('/', 1) + if _is_keyword(x=key): + raise TensorforceError.value( + name='NestedDict', argument='key', value=key, hint='reserved keyword' + ) + if super().__contains__(key): + value = super().__getitem__(key) + else: + value = self.__class__() + super(NestedDict, value).__setattr__('value_type', self.value_type) + super(NestedDict, value).__setattr__('overwrite', self.overwrite) + assert isinstance(value, self.__class__) + value[subkey] = subvalue + if not super().__contains__(key): + # After setting subkey since setitem may modify value (TrackableNestedDict) + self[key] = value + + else: + if _is_keyword(x=key): + raise TensorforceError.value( + name='NestedDict', argument='key', value=key, hint='reserved keyword' + ) + if not self.overwrite and super().__contains__(key): + raise TensorforceError.value( + name='NestedDict', argument='key', value=key, condition='already set' + ) + super().__setitem__(key, value) + + def __repr__(self): + return '{type}({items})'.format(type=self.__class__.__name__, items=', '.join( + '{key}={value}'.format(key=key, value=value) for key, value in super().items() + )) + + def key(self): + return next(iter(self)) + + def value(self): + return next(iter(self.values())) + + def item(self): + return next(iter(self.items())) + + def get(self, key, default=None): + if isinstance(key, (list, tuple)): + return tuple(self.get(key=x, default=default) for x in key) + elif key in self: + return self[key] + else: + return default + + def update(self, other=None, **kwargs): + if other is not None: + if hasattr(other, 'items'): + other = other.items() + for key, value in other: + if key in kwargs: + raise TensorforceError.value( + name='NestedDict.update', argument='key', value=key, + condition='specified twice' + ) + self[key] = value + for key, value in kwargs.items(): + self[key] = value + + def pop(self, key, default=None): + if key is None or key == self.__class__._SINGLETON: + assert super().__len__() == 0 or self.is_singleton() + if super().__contains__(self.__class__._SINGLETON): + value = super().__getitem__(self.__class__._SINGLETON) + super().__delitem__(self.__class__._SINGLETON) + else: + value = default + return value + + elif not isinstance(key, str): + raise TensorforceError.type(name='NestedDict', argument='key', dtype=type(key)) + + elif key.startswith(self.__class__._SINGLETON + '/'): + raise TensorforceError.value(name='NestedDict', argument='key', value=key) + + elif self.is_singleton(): + value = self.singleton() + if isinstance(value, self.value_type): + return default + else: + return value.pop(key, default=default) + + elif '/' in key: + key, subkey = key.split('/', 1) + if super().__contains__(key): + value = super().__getitem__(key) + assert isinstance(value, self.__class__) + return value.pop(subkey, default) + else: + return default + + 
else: + # TODO: can't use pop since __delitem__ not implemented + if super().__contains__(key): + value = super().__getitem__(key) + super().__delitem__(key) + return value + else: + return default + + __str__ = __repr__ + + has_key = __contains__ + + keys = __iter__ + iterkeys = __iter__ + viewkeys = __iter__ + + itervalues = values + viewvalues = values + + iteritems = items + viewitems = items + + def __setattr__(self, name, value): + raise NotImplementedError + + def __delattr__(self, name): + raise NotImplementedError + + def __delitem__(self, key): + raise NotImplementedError + + def clear(self): + raise NotImplementedError + + @classmethod + def fromkeys(cls, iterable, value=None): + raise NotImplementedError + + def popitem(self): + raise NotImplementedError + + def setdefault(self, key, default=None): + raise NotImplementedError diff --git a/tensorforce/core/utils/tensor_spec.py b/tensorforce/core/utils/tensor_spec.py new file mode 100644 index 000000000..0dabbdede --- /dev/null +++ b/tensorforce/core/utils/tensor_spec.py @@ -0,0 +1,805 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from functools import total_ordering + +import numpy as np +import tensorflow as tf + +from tensorforce import TensorforceError, util +from tensorforce.core.utils import tf_util + + +def _normalize_type(*, dtype): + if isinstance(dtype, np.dtype): + dtype = dtype.type + dtypes = { + 'bool': 'bool', bool: 'bool', np.bool8: 'bool', tf.bool: 'bool', + 'int': 'int', int: 'int', + np.int8: 'int', np.int16: 'int', np.int32: 'int', np.int64: 'int', + np.uint8: 'int', np.uint16: 'int', np.uint32: 'int', np.uint64: 'int', + tf.int16: 'int', tf.int32: 'int', tf.int64: 'int', + 'float': 'float', float: 'float', + np.float16: 'float', np.float32: 'float', np.float64: 'float', + tf.float16: 'float', tf.float32: 'float', tf.float64: 'float' + } + return dtypes.get(dtype, None) + + +@total_ordering +class TensorSpec(object): + + def __init__( + self, *, type, shape=(), min_value=None, max_value=None, num_values=None, overwrite=False + ): + super().__setattr__('overwrite', True) + super().__setattr__('type', None) + if isinstance(type, tf.dtypes.DType): + super().__setattr__('_tf_type', type) + assert not overwrite + self.type = type + self.shape = shape + if num_values is not None and (min_value is not None or max_value is not None): + if self.type == 'int': + raise TensorforceError.invalid( + name='TensorSpec', argument='min/max_value', + condition='type is int and num_values specified' + ) + elif self.type == 'float': + raise TensorforceError.invalid( + name='TensorSpec', argument='num_values', + condition='type is float and min/max_value specified' + ) + if min_value is not None: + self.min_value = min_value + if max_value is not None: + self.max_value = max_value + if num_values is not None: + self.num_values = num_values + super().__setattr__('overwrite', overwrite) + + 
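+ # A minimal usage sketch (illustrative values; 'float' typically maps to tf.float32):
+ #     spec = TensorSpec(type='float', shape=(4,), min_value=-1.0, max_value=1.0)
+ #     spec.signature(batched=True)    # tf.TensorSpec(shape=(None, 4), dtype=tf.float32)
+ #     spec.to_tensor(value=[[0.0] * 4], batched=True)    # range-checked tf.Tensor
+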
def __eq__(self, other): + return type(other) is TensorSpec and self.type == other.type and \ + self.shape == other.shape and self.min_value == other.min_value and \ + self.max_value == other.max_value and self.num_values == other.num_values + + @property + def rank(self): + return len(self.shape) + + @property + def size(self): + return util.product(xs=self.shape) + + def py_type(self): + if self.type == 'bool': + return bool + elif self.type == 'int': + return int + elif self.type == 'float': + return float + + def np_type(self): + return util.np_dtype(dtype=self.type) + + def tf_type(self): + if hasattr(self, '_tf_type'): + return self._tf_type + else: + return tf_util.DTYPE_MAPPING[self.type] + + def is_underspecified(self): + if self.type is None or isinstance(self.type, tuple): + return True + elif self.shape is None or (len(self.shape) > 0 and self.shape[0] is None) or \ + any(x <= 0 for x in self.shape if x is not None): + return True + elif self.type == 'int' and self.num_values is not None and self.num_values <= 0: + return True + else: + return False + + def json(self): + if self.type == 'bool': + return dict(type=self.type, shape=self.shape) + + elif self.type == 'int' and self.num_values is not None: + return dict(type=self.type, shape=self.shape, num_values=self.num_values) + + else: + spec = dict(type=self.type, shape=self.shape) + if self.min_value is not None: + if isinstance(self.min_value, np.ndarray): + spec['min_value'] = self.min_value.tolist() + else: + spec['min_value'] = self.min_value + if self.max_value is not None: + if isinstance(self.max_value, np.ndarray): + spec['max_value'] = self.max_value.tolist() + else: + spec['max_value'] = self.max_value + return spec + + def signature(self, *, batched): + # Check whether underspecified + if self.is_underspecified(): + raise TensorforceError.unexpected() + + # Add leading variable-dim axis if batched + if batched: + shape = (None,) + self.shape + else: + shape = self.shape + + # TensorFlow TensorSpec + return tf.TensorSpec(shape=tf.TensorShape(dims=shape), dtype=self.tf_type()) + + def to_tensor(self, *, value, batched, recover_empty=False, name='TensorSpec.to_tensor'): + # Check whether underspecified + if self.is_underspecified(): + raise TensorforceError.unexpected() + + # Convert value to Numpy array, checks type + value = np.asarray(a=value, dtype=self.np_type()) + + # Check whether shape matches + if value.shape[int(batched):] != self.shape: + raise TensorforceError.value( + name=name, argument='value shape', value=value.shape[int(batched):], + hint='!= {}'.format(self.shape) + ) + + # Check for nan or inf + if np.isnan(value).any() or np.isinf(value).any(): + raise TensorforceError.value( + name=name, argument='value', value=value, hint='contains nan/inf' + ) + + # Check num_values + if self.type == 'int' and self.num_values is not None: + if (value < 0).any() or (value >= self.num_values).any(): + raise TensorforceError.value( + name=name, argument='value', value=value, + hint='not in [0, {}] (num_values)'.format(self.num_values - 1) + ) + + # Check min/max_value + elif self.type == 'int' or self.type == 'float': + if self.min_value is not None: + if (value < self.min_value).any(): + raise TensorforceError.value( + name=name, argument='value', value=value, + hint='< {} (min_value)'.format(self.min_value) + ) + if self.max_value is not None: + if (value > self.max_value).any(): + raise TensorforceError.value( + name=name, argument='value', value=value, + hint='> {} (max_value)'.format(self.max_value) + ) + + # 
Convert Numpy array to TensorFlow tensor + return tf.convert_to_tensor(value=value, dtype=self.tf_type()) + + def from_tensor(self, *, tensor, batched, name='TensorSpec.from_tensor'): + # Check whether underspecified + if self.is_underspecified(): + raise TensorforceError.unexpected() + + # Check whether TensorFlow tensor + if not isinstance(tensor, tf.Tensor): + raise TensorforceError.type(name=name, argument='tensor', dtype=type(tensor)) + + # Check whether tensor type and shape match + if tf_util.dtype(x=tensor) != self.type: + raise TensorforceError.value(name=name, argument='tensor.dtype', value=tensor) + if tf_util.shape(x=tensor)[int(batched):] != self.shape: + raise TensorforceError.value(name=name, argument='tensor.shape', value=tensor) + + # Convert tensor value to Numpy array + value = tensor.numpy() + + # Check for nan or inf + if np.isnan(value).any() or np.isinf(value).any(): + raise TensorforceError.value( + name=name, argument='tensor', value=value + ) + + # Check num_values + if self.type == 'int' and self.num_values is not None: + if (value < 0).any() or (value >= self.num_values).any(): + raise TensorforceError.value(name=name, argument='tensor', value=value) + + # Check min/max_value + elif self.type == 'int' or self.type == 'float': + if self.min_value is not None: + if (value < self.min_value).any(): + raise TensorforceError.value(name=name, argument='tensor', value=value) + if self.max_value is not None: + if (value > self.max_value).any(): + raise TensorforceError.value(name=name, argument='tensor', value=value) + + # If singleton shape, return Python object instead of Numpy array + if self.shape == () and not batched: + value = value.item() + + return value + + def np_assert(self, *, x, message, batched=False): + if message is not None and '{name}' in message: + message = message.format(name='', issue='{issue}') + + if batched or self.shape != (): + x = np.asarray(x) + if isinstance(x, np.ndarray): + if _normalize_type(dtype=x.dtype) != self.type and \ + (self.type != 'float' or _normalize_type(dtype=x.dtype) != 'int') and \ + (self.type != 'int' or ( + _normalize_type(dtype=x.dtype) == 'bool' and self.num_values != 2 + )): + raise TensorforceError( + message.format(issue=('type {} != {}'.format(x.dtype, self.type))) + ) + elif x.shape[int(batched):] != self.shape: + raise TensorforceError( + message.format(issue=('shape {} != {}'.format(x.shape, self.shape))) + ) + elif batched or self.shape != (): + if x.ndim >= 1 and batched: + raise TensorforceError( + message.format(issue=('shape {} != {}'.format(x.shape[1:], self.shape))) + ) + else: + raise TensorforceError( + message.format(issue=('shape {} != {}'.format(x.shape, self.shape))) + ) + elif isinstance(x, np.generic): + if _normalize_type(dtype=x.dtype) != self.type and \ + (self.type != 'float' or _normalize_type(dtype=x.dtype) != 'int'): + raise TensorforceError( + message.format(issue=('type {} != {}'.format(x.dtype, self.type))) + ) + elif self.type == 'bool' and isinstance(x, bool): + pass + elif self.type == 'int' and isinstance(x, int): + pass + elif self.type == 'float' and isinstance(x, (int, float)): + pass + else: + raise TensorforceError( + message.format(issue=('type {} != {}'.format(type(x), self.type))) + ) + + def tf_assert(self, *, x, batch_size=None, include_type_shape=False, message=None): + if not isinstance(x, (tf.Tensor, tf.Variable)): + raise TensorforceError.type(name='TensorSpec.tf_assert', argument='x', dtype=type(x)) + + if batch_size is None: + pass + elif not isinstance(batch_size, 
tf.Tensor): + raise TensorforceError.type( + name='TensorSpec.tf_assert', argument='batch_size', dtype=type(batch_size) + ) + elif tf_util.dtype(x=batch_size) != 'int' or tf_util.shape(x=batch_size) != (): + raise TensorforceError.value( + name='TensorSpec.tf_assert', argument='batch_size', value=batch_size + ) + assertions = list() + + if message is not None and '{name}' in message: + message = message.format(name='', issue='{issue}') + + # Type + tf.debugging.assert_type( + tensor=x, tf_type=self.tf_type(), + message=(None if message is None else message.format(issue='type')) + ) + + # Shape + shape = tf_util.constant(value=self.shape, dtype='int') + if batch_size is not None: + shape = tf.concat(values=(tf.expand_dims(input=batch_size, axis=0), shape), axis=0) + assertions.append( + tf.debugging.assert_equal( + x=tf_util.cast(x=tf.shape(input=x), dtype='int'), y=shape, + message=(None if message is None else message.format(issue='shape')) + ) + ) + + if self.type == 'float': + assertions.append(tf.debugging.assert_all_finite( + x=x, message=('' if message is None else message.format(issue='inf/nan value')) + )) + + # Min/max value (includes num_values) + if self.type != 'bool' and self.min_value is not None: + assertions.append(tf.debugging.assert_greater_equal( + x=x, y=tf_util.constant(value=self.min_value, dtype=self.type), + message=(None if message is None else message.format(issue='min value')) + )) + if self.type != 'bool' and self.max_value is not None: + assertions.append(tf.debugging.assert_less_equal( + x=x, y=tf_util.constant(value=self.max_value, dtype=self.type), + message=(None if message is None else message.format(issue='max value')) + )) + + return assertions + + def unify(self, *, other, name='TensorSpec.unify'): + # Unify type + if self.type is None: + dtype = other.type + elif other.type is None: + dtype = self.type + elif util.is_iterable(x=self.type): + if util.is_iterable(x=other.type): + if set(self.type) <= set(other.type): + dtype = self.type + elif set(other.type) <= set(self.type): + dtype = other.type + else: + raise TensorforceError.mismatch( + name=name, argument='type', value1=self.type, value2=other.type + ) + elif other.type in self.type: + dtype = other.type + else: + raise TensorforceError.mismatch( + name=name, argument='type', value1=self.type, value2=other.type + ) + elif util.is_iterable(x=other.type): + if self.type in other.type: + dtype = self.type + else: + raise TensorforceError.mismatch( + name=name, argument='type', value1=self.type, value2=other.type + ) + elif self.type == other.type: + dtype = self.type + else: + raise TensorforceError.mismatch( + name=name, argument='type', value1=self.type, value2=other.type + ) + + # Unify shape + if self.shape is None: + shape = other.shape + elif other.shape is None: + shape = self.shape + else: + reverse_shape = list() + start = len(self.shape) - 1 + if self.shape[-1] is None: + reverse_shape.extend(other.shape[len(self.shape) - 1:]) + start = len(self.shape) - 2 + elif other.shape[-1] is None: + reverse_shape.extend(self.shape[len(other.shape) - 1:]) + start = len(other.shape) - 2 + elif len(self.shape) != len(other.shape): + raise TensorforceError.mismatch( + name=name, argument='rank', value1=self.rank, value2=other.rank + ) + for n in range(start, -1, -1): + if self.shape[n] == 0: + reverse_shape.append(other.shape[n]) + elif other.shape[n] == 0: + reverse_shape.append(self.shape[n]) + elif self.shape[n] == -1 and other.shape[n] > 0: + reverse_shape.append(other.shape[n]) + elif other.shape[n] 
== -1 and self.shape[n] > 0: + reverse_shape.append(self.shape[n]) + elif self.shape[n] == other.shape[n]: + reverse_shape.append(self.shape[n]) + else: + raise TensorforceError.mismatch( + name=name, argument='shape', value1=self.shape, value2=other.shape + ) + shape = tuple(reversed(reverse_shape)) + + # Unify min_value + if dtype == 'bool': + min_value = None + elif self.type != 'bool' and self.min_value is not None: + if other.type != 'bool' and other.min_value is not None: + if isinstance(self.min_value, np.ndarray) or \ + isinstance(other.min_value, np.ndarray): + min_value = np.minimum(self.min_value, other.min_value) + elif self.min_value < other.min_value: + min_value = other.min_value + else: + min_value = self.min_value + else: + min_value = self.min_value + elif other.type != 'bool' and other.min_value is not None: + min_value = other.min_value + else: + min_value = None + + # Unify max_value + if dtype == 'bool': + max_value = None + elif self.type != 'bool' and self.max_value is not None: + if other.type != 'bool' and other.max_value is not None: + if isinstance(self.max_value, np.ndarray) or \ + isinstance(other.max_value, np.ndarray): + max_value = np.maximum(self.max_value, other.max_value) + elif self.max_value < other.max_value: + max_value = other.max_value + else: + max_value = self.max_value + else: + max_value = self.max_value + elif other.type != 'bool' and other.max_value is not None: + max_value = other.max_value + else: + max_value = None + if min_value is not None and max_value is not None: + if isinstance(min_value, np.ndarray) or isinstance(max_value, np.ndarray): + if (min_value > max_value).any(): + raise TensorforceError.mismatch( + name=name, argument='min/max_value', value1=min_value, value2=max_value + ) + else: + if min_value > max_value: + raise TensorforceError.mismatch( + name=name, argument='min/max_value', value1=min_value, value2=max_value + ) + + # Unify num_values + if dtype != 'int' and (not isinstance(dtype, tuple) or 'int' not in dtype): + num_values = None + elif self.type == 'int' and self.num_values is not None: + if other.type == 'int' and other.num_values is not None: + if self.num_values == 0: + num_values = other.num_values + elif other.num_values == 0: + num_values = self.num_values + elif self.num_values == other.num_values: + num_values = self.num_values + else: + raise TensorforceError.mismatch( + name=name, argument='num_values', value1=self.num_values, + value2=other.num_values + ) + else: + num_values = self.num_values + elif other.type == 'int' and other.num_values is not None: + num_values = other.num_values + else: + num_values = None + if num_values is not None: + min_value = None + max_value = None + + # Unified tensor spec + return TensorSpec( + type=dtype, shape=shape, min_value=min_value, max_value=max_value, num_values=num_values + ) + + # def __len__(self): + # return 1 + + # def __iter__(self): + # return + # yield + + # def values(self): + # yield self + + # def items(self): + # yield None, self + + # def value(self): + # return self + + def copy(self, *, overwrite=None): + if overwrite is None: + overwrite = self.overwrite + + if self.type == 'bool': + return TensorSpec(type=self.type, shape=self.shape, overwrite=overwrite) + + elif self.type == 'int' and self.num_values is not None: + return TensorSpec( + type=self.type, shape=self.shape, num_values=self.num_values, overwrite=overwrite + ) + + else: + return TensorSpec( + type=self.type, shape=self.shape, min_value=self.min_value, + max_value=self.max_value, 
overwrite=overwrite + ) + + # def fmap(self, *, function, cls=None, with_names=False, zip_values=None): + # args = (self,) + + # # with_names + # if with_names: + # args = (None,) + args + + # # zip_values + # if isinstance(zip_values, (tuple, list)): + # for value in zip_values: + # if isinstance(value, NestedDict): + # assert len(value) == 1 and None in value + # args += (value[None],) + # else: + # args += (value,) + # elif isinstance(zip_values, NestedDict): + # assert len(zip_values) == 1 and None in zip_values + # args += (zip_values[None],) + # elif zip_values is not None: + # args += (zip_values,) + + # # Actual value mapping + # value = function(*args) + + # # from tensorforce.core import TensorDict + # if cls is None: # or issubclass(cls, TensorDict): + # # Use same class + # return value + + # elif issubclass(cls, list): + # # Special target class list implies flatten + # values = cls() + # if isinstance(value, cls): + # values.extend(value) + # else: + # values.append(value) + # return values + + # elif issubclass(cls, dict): + # # Custom target class + # values = cls() + # values[None] = value + # return values + + # else: + # raise TensorforceError.value(name='TensorSpec.fmap', argument='cls', value=cls) + + def __setattr__(self, name, value): + if not self.overwrite: + raise NotImplementedError + + if name == 'type': + if value is None: + # Type: None + pass + elif util.is_iterable(x=value): + # Type: tuple(*types) + if any(_normalize_type(dtype=x) is None for x in value): + raise TensorforceError.value(name='TensorSpec', argument=name, value=value) + value = tuple(_normalize_type(dtype=x) for x in value) + else: + # Type: 'bool' | 'int' | 'float' + if _normalize_type(dtype=value) is None: + raise TensorforceError.value(name='TensorSpec', argument=name, value=value) + value = _normalize_type(dtype=value) + + # Delete attributes not required anymore + if self.type is not None and self.type != 'bool' and value == 'bool': + super().__delattr__('min_value') + super().__delattr__('max_value') + if self.type is not None and ( + self.type == 'int' or (isinstance(self.type, tuple) and 'int' in self.type) + ) and value != 'int' and (not isinstance(value, tuple) or 'int' not in value): + super().__delattr__('num_values') + + # Set type attribute + super().__setattr__(name, value) + + # Reset attributes + if self.type == 'int' or (isinstance(self.type, tuple) and 'int' in self.type): + self.min_value = None + self.max_value = None + self.num_values = None + elif self.type != 'bool': + self.min_value = None + self.max_value = None + + elif name == 'shape': + if value is None: + # Shape: None + pass + elif util.is_iterable(x=value): + if len(value) > 0 and value[0] is None: + # Shape: tuple(None, *ints >= -1) + try: + value = (None,) + tuple(int(x) for x in value[1:]) + if any(x < -1 for x in value[1:]): + raise TensorforceError.value( + name='TensorSpec', argument=name, value=value + ) + except BaseException: + raise TensorforceError.type( + name='TensorSpec', argument=name, value=type(value) + ) + else: + # Shape: tuple(*ints >= -1) + try: + value = tuple(int(x) for x in value) + if any(x < -1 for x in value): + raise TensorforceError.value( + name='TensorSpec', argument=name, value=value + ) + except BaseException: + raise TensorforceError.value(name='TensorSpec', argument=name, value=value) + else: + # Shape: (int >= -1,) + try: + value = (int(value),) + if value[0] < -1: + raise TensorforceError.value(name='TensorSpec', argument=name, value=value) + except BaseException: + raise 
TensorforceError.type(name='TensorSpec', argument=name, value=type(value)) + + # TODO: check min/max_value shape if np.ndarray + + # Set shape attribute + super().__setattr__(name, value) + + elif name == 'min_value' or name == 'max_value': + # Invalid for type == 'bool', or type == 'int' and num_values != None + if self.type == 'bool': + raise TensorforceError.invalid( + name='TensorSpec', argument=name, condition='type is bool' + ) + + if value is None: + # Min/max value: None + pass + else: + # Min/max value: int/float + try: + value = self.py_type()(value) + if self.type == 'int' and self.num_values is not None: + if name == 'min_value': + assert value == 0 + elif name == 'max_value': + assert value == self.num_values - 1 + except BaseException: + try: + value = np.asarray(value, dtype=self.np_type()) + if self.type == 'int': + assert self.num_values is None + except BaseException: + raise TensorforceError.type( + name='TensorSpec', argument=name, value=type(value) + ) + + if isinstance(value, np.ndarray): + if self.shape is not None and ( + value.ndim > len(self.shape) or value.shape != self.shape[:value.ndim] + ): + raise TensorforceError.value( + name='TensorSpec', argument=(name + ' shape'), value=value.shape, + hint='incompatible with {}'.format(self.shape) + ) + if name == 'min_value' and self.max_value is not None and \ + (value > self.max_value - util.epsilon).any(): + raise TensorforceError.value( + name='TensorSpec', argument=name, value=value, + condition='max_value = {}'.format(self.max_value) + ) + elif name == 'max_value' and self.min_value is not None and \ + (value < self.min_value + util.epsilon).any(): + raise TensorforceError.value( + name='TensorSpec', argument=name, value=value, + condition='min_value = {}'.format(self.min_value) + ) + else: + if name == 'min_value' and self.max_value is not None: + if isinstance(self.max_value, np.ndarray): + if (value > self.max_value - util.epsilon).any(): + raise TensorforceError.value( + name='TensorSpec', argument=name, value=value, + condition='max_value = {}'.format(self.max_value) + ) + elif value > self.max_value - util.epsilon: + raise TensorforceError.value( + name='TensorSpec', argument=name, value=value, + condition='max_value = {}'.format(self.max_value) + ) + elif name == 'max_value' and self.min_value is not None: + if isinstance(self.min_value, np.ndarray): + if (value < self.min_value + util.epsilon).any(): + raise TensorforceError.value( + name='TensorSpec', argument=name, value=value, + condition='min_value = {}'.format(self.min_value) + ) + elif value < self.min_value + util.epsilon: + raise TensorforceError.value( + name='TensorSpec', argument=name, value=value, + condition='min_value = {}'.format(self.min_value) + ) + + # Set min/max_value attribute + super().__setattr__(name, value) + + elif name == 'num_values': + # Invalid for type != 'int' + if self.type != 'int' and (not isinstance(self.type, tuple) or 'int' not in self.type): + raise TensorforceError.invalid( + name='TensorSpec', argument=name, condition='type is {}'.format(self.type) + ) + + if value is None: + # Num values: None + pass + else: + # Num values: int >= 0 + try: + value = int(value) + except BaseException: + raise TensorforceError.type(name='TensorSpec', argument=name, value=type(value)) + if value < 0: + raise TensorforceError.value(name='TensorSpec', argument=name, value=value) + + # Set num_values attribute and min/max_value accordingly + super().__setattr__(name, value) + if value is not None and value > 0: + 
super().__setattr__('min_value', 0) + super().__setattr__('max_value', value - 1) + else: + super().__setattr__('min_value', None) + super().__setattr__('max_value', None) + + else: + raise TensorforceError.invalid(name='TensorSpec', argument=name) + + def __repr__(self): + if self.type == 'int' and self.num_values is not None: + return 'TensorSpec(type={type}, shape={shape}, num_values={num_values})'.format( + type=self.type, shape=self.shape, num_values=self.num_values + ) + elif self.type != 'bool' and self.min_value is not None: + if self.max_value is None: + return 'TensorSpec(type={type}, shape={shape}, min_value={min_value})'.format( + type=self.type, shape=self.shape, min_value=self.min_value + ) + else: + return ('TensorSpec(type={type}, shape={shape}, min_value={min_value}, max_value=' + '{max_value})').format( + type=self.type, shape=self.shape, min_value=self.min_value, + max_value=self.max_value + ) + elif self.type != 'bool' and self.max_value is not None: + return 'TensorSpec(type={type}, shape={shape}, max_value={max_value})'.format( + type=self.type, shape=self.shape, max_value=self.max_value + ) + else: + return 'TensorSpec(type={type}, shape={shape})'.format(type=self.type, shape=self.shape) + + __str__ = __repr__ + + def tuple(self): + return ( + self.type, self.shape, getattr(self, 'min_value', None), + getattr(self, 'max_value', None), getattr(self, 'num_values', None) + ) + + def __hash__(self): + return hash(self.tuple()) + + def __eq__(self, other): + return isinstance(other, TensorSpec) and self.tuple() == other.tuple() + + def __lt__(self, other): + if not isinstance(other, TensorSpec): + return NotImplementedError + return self.tuple() < other.tuple() + + def __delattr__(self, name): + raise NotImplementedError diff --git a/tensorforce/core/utils/tensors_spec.py b/tensorforce/core/utils/tensors_spec.py new file mode 100644 index 000000000..4a50ea05e --- /dev/null +++ b/tensorforce/core/utils/tensors_spec.py @@ -0,0 +1,126 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from tensorforce import TensorforceError +from tensorforce.core.utils import ArrayDict, NestedDict, SignatureDict, TensorDict, TensorSpec + + +class TensorsSpec(NestedDict): + + def __init__(self, *args, singleton=None, **kwargs): + super().__init__( + *args, value_type=TensorSpec, overwrite=False, singleton=singleton, **kwargs + ) + + def signature(self, *, batched): + return self.fmap(function=(lambda spec: spec.signature(batched=batched)), cls=SignatureDict) + + def to_tensor(self, *, value, batched, recover_empty=False, name='TensorSpec.to_tensor'): + if value is not None and not isinstance(value, ArrayDict): + raise TensorforceError.type(name=name, argument='value', dtype=type(value)) + + # TODO: improve exception message to include invalid keys + if not recover_empty and set(value) != set(self): + raise TensorforceError.value(name=name, argument='value', value=value) + + tensor = TensorDict() + for name, spec in super(NestedDict, self).items(): + if recover_empty and name not in value: + assert not isinstance(spec, self.value_type) and len(spec) == 0 + tensor[name] = spec.to_tensor( + value=None, batched=batched, recover_empty=recover_empty, name=name + ) + else: + tensor[name] = spec.to_tensor( + value=value[name], batched=batched, recover_empty=recover_empty, name=name + ) + return tensor + + def from_tensor(self, *, tensor, batched, name='TensorSpec.from_tensor'): + if not isinstance(tensor, TensorDict): + raise TensorforceError.type(name=name, argument='tensor', dtype=type(tensor)) + + # TODO: improve exception message to include invalid keys + if set(tensor) != set(self): + raise TensorforceError.value(name=name, argument='tensor', value=tensor) + + value = ArrayDict() + for name, spec in super(NestedDict, self).items(): + value[name] = spec.from_tensor(tensor=tensor[name], batched=batched, name=name) + return value + + def np_assert(self, *, x, message, batched=False): + if not isinstance(x, dict): + raise TensorforceError( + message.format(name='', issue=('type {} != dict'.format(type(x)))) + ) + + for name, spec, x in self.zip_items(x): + if name is None: + name = '' + spec.np_assert(x=x, message=( + None if message is None else message.format(name=name, issue='{issue}') + )) + + def tf_assert(self, *, x, batch_size=None, include_type_shape=False, message=None): + if not isinstance(x, TensorDict): + raise TensorforceError( + message.format(name='', issue=('type {} != TensorDict'.format(type(x)))) + ) + + assertions = list() + for name, spec, x in self.zip_items(x): + if name is None: + name = '' + assertions.extend(spec.tf_assert( + x=x, batch_size=batch_size, include_type_shape=include_type_shape, + message=(None if message is None else message.format(name=name, issue='{issue}')) + )) + + return assertions + + def unify(self, *, other): + if set(self) != set(other): + raise TensorforceError.mismatch( + name='TensorsSpec.unify', argument='keys', value1=sorted(self), value2=sorted(other) + ) + return self.fmap(function=(lambda x, y: x.unify(other=y)), zip_values=other) + + def __setitem__(self, key, value): + if not isinstance(value, TensorSpec) and not isinstance(value, TensorsSpec): + if not isinstance(value, dict): + raise TensorforceError.type(name='TensorsSpec', argument='value', dtype=type(value)) + elif 'type' in value or 'shape' in value: + value = TensorSpec(**value, overwrite=self.overwrite) + else: + value = TensorsSpec(value) + + if key == 'horizons': + if not isinstance(value, 
TensorSpec) or value.type != 'int' or value.shape != (2,): + raise TensorforceError.value(name='TensorsSpec', argument='horizons', value=value) + + elif key == 'parallel': + if not isinstance(value, TensorSpec) or value.type != 'int' or value.shape != (): + raise TensorforceError.value(name='TensorsSpec', argument='parallel', value=value) + + elif key == 'reward': + if not isinstance(value, TensorSpec) or value.type != 'float' or value.shape != (): + raise TensorforceError.value(name='TensorsSpec', argument='reward', value=value) + + elif key == 'terminal': + if not isinstance(value, TensorSpec) or value.type != 'int' or value.shape != (): + raise TensorforceError.value(name='TensorsSpec', argument='terminal', value=value) + + super().__setitem__(key, value) diff --git a/tensorforce/core/utils/tf_util.py b/tensorforce/core/utils/tf_util.py new file mode 100644 index 000000000..8db283609 --- /dev/null +++ b/tensorforce/core/utils/tf_util.py @@ -0,0 +1,135 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf + +from tensorforce import TensorforceError + + +DTYPE_MAPPING = dict(bool=tf.dtypes.bool, int=tf.dtypes.int64, float=tf.dtypes.float32) + + +def is_tensor(*, x): + return isinstance(x, (tf.IndexedSlices, tf.Tensor, tf.Variable)) + + +def get_dtype(*, type): + if type not in DTYPE_MAPPING: + raise TensorforceError.value( + name='tf_util.cast', argument='type', value=type, + hint='not in {{{}}}'.format(','.join(DTYPE_MAPPING)) + ) + return DTYPE_MAPPING[type] + + +def dtype(*, x=None, dtype=None, fallback_tf_dtype=False): + for dtype, tf_dtype in DTYPE_MAPPING.items(): + if x.dtype == tf_dtype: + return dtype + else: + if fallback_tf_dtype: + return x.dtype + raise TensorforceError.value(name='tf_util.dtype', argument='x.dtype', value=x.dtype) + + +def rank(*, x): + return x.get_shape().ndims + + +def shape(*, x, unknown=-1): + return tuple(unknown if dims is None else dims for dims in x.get_shape().as_list()) + + +# def is_dtype(x, dtype): +# for str_dtype, tf_dtype in tf_dtype_mapping.items(): +# if x.dtype == tf_dtype and dtype == str_dtype: +# return True +# else: +# return False +# # if x.dtype == tf.float32: +# # return 'float' +# # else: +# # raise TensorforceError.value(name='util.dtype', argument='x', value=x.dtype) + + +# Conversion to generally supported TensorFlow type + + +def int32(x): + if dtype(x=x) != 'int' or get_dtype(type='int') != tf.int32: + x = tf.cast(x=x, dtype=tf.int32) + return x + + +def float32(x): + if dtype(x=x) != 'float' or get_dtype(type='float') != tf.float32: + x = tf.cast(x=x, dtype=tf.float32) + return x + + +# TensorFlow functions + + +def constant(*, value, dtype, shape=None): + return tf.constant(value=value, dtype=get_dtype(type=dtype), shape=shape) + + +def zeros(*, shape, dtype): + return tf.zeros(shape=shape, dtype=get_dtype(type=dtype)) + + +def ones(*, shape, dtype): + return 
tf.ones(shape=shape, dtype=get_dtype(type=dtype)) + + +def identity(input, name=None): + zero = tf.zeros_like(input=input) + if zero.dtype is tf.bool: + return tf.math.logical_or(x=input, y=zero, name=name) + else: + return tf.math.add(x=input, y=zero, name=name) + + +# def no_op(): +# return identity(input=constant(value=False, dtype='bool')) + + +def cast(*, x, dtype): + for str_dtype, tf_dtype in DTYPE_MAPPING.items(): + if x.dtype == tf_dtype and dtype == str_dtype: + return x + else: + return tf.cast(x=x, dtype=get_dtype(type=dtype)) + + +# Other helper functions + + +def always_true(*args, **kwargs): + return constant(value=True, dtype='bool') + + +def lift_indexedslices(binary_op, x, y, with_assertions): + if isinstance(x, tf.IndexedSlices): + assert isinstance(y, tf.IndexedSlices) + assertions = list() + if with_assertions: + assertions.append(tf.debugging.assert_equal(x=x.indices, y=y.indices)) + with tf.control_dependencies(control_inputs=assertions): + return tf.IndexedSlices( + values=binary_op(x.values, y.values), indices=x.indices, dense_shape=x.dense_shape + ) + else: + return binary_op(x, y) diff --git a/tensorforce/environments/__init__.py b/tensorforce/environments/__init__.py index f2cba97e7..cee9582ca 100644 --- a/tensorforce/environments/__init__.py +++ b/tensorforce/environments/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,37 @@ # limitations under the License. # ============================================================================== +from tensorforce.environments.environment import Environment, RemoteEnvironment -from tensorforce.environments.environment import Environment -from tensorforce.tests.minimal_test import MinimalTest +from tensorforce.environments.multiprocessing_environment import MultiprocessingEnvironment +from tensorforce.environments.socket_environment import SocketEnvironment + +from tensorforce.environments.arcade_learning_environment import ArcadeLearningEnvironment +from tensorforce.environments.openai_gym import OpenAIGym +from tensorforce.environments.openai_retro import OpenAIRetro +from tensorforce.environments.open_sim import OpenSim +from tensorforce.environments.pygame_learning_environment import PyGameLearningEnvironment +from tensorforce.environments.vizdoom import ViZDoom +from tensorforce.environments.carla_environment import CARLAEnvironment + +from tensorforce.environments.cartpole import CartPole environments = dict( - minimal_test=MinimalTest, + default=OpenAIGym, + ale=ArcadeLearningEnvironment, arcade_learning_environment=ArcadeLearningEnvironment, + custom_cartpole=CartPole, + gym=OpenAIGym, openai_gym=OpenAIGym, + retro=OpenAIRetro, openai_retro=OpenAIRetro, + osim=OpenSim, open_sim=OpenSim, + ple=PyGameLearningEnvironment, pygame_learning_environment=PyGameLearningEnvironment, + vizdoom=ViZDoom, + carla=CARLAEnvironment, carla_environment=CARLAEnvironment ) -__all__ = ['Environment', 'MinimalTest'] + +__all__ = [ + 'ArcadeLearningEnvironment', 'Environment', 'MazeExplorer', 'MultiprocessingEnvironment', + 'OpenAIGym', 'OpenAIRetro', 'OpenSim', 'PyGameLearningEnvironment', 'RemoteEnvironment', + 'SocketEnvironment', 'ViZDoom', 'CARLAEnvironment' +] diff --git a/tensorforce/environments/arcade_learning_environment.py b/tensorforce/environments/arcade_learning_environment.py new file 
mode 100644
index 000000000..aea545c03
--- /dev/null
+++ b/tensorforce/environments/arcade_learning_environment.py
@@ -0,0 +1,117 @@
+# Copyright 2020 Tensorforce Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import numpy as np
+
+from tensorforce.environments import Environment
+
+
+class ArcadeLearningEnvironment(Environment):
+    """
+    [Arcade Learning Environment](https://github.com/mgbellemare/Arcade-Learning-Environment)
+    adapter (specification key: `ale`, `arcade_learning_environment`).
+
+    May require:
+    ```bash
+    sudo apt-get install libsdl1.2-dev libsdl-gfx1.2-dev libsdl-image1.2-dev cmake
+    ```
+
+    Args:
+        level (string): ALE rom file
+            (required).
+        life_loss_terminal (bool): Signals a terminal state on loss of life
+            (default: false).
+        life_loss_punishment (float): Punishment subtracted from the reward on loss of life,
+            if positive
+            (default: 0.0).
+        repeat_action_probability (float): Repeats last action with given probability
+            (default: 0.0).
+        visualize (bool): Whether to visualize interaction
+            (default: false).
+        frame_skip (int > 0): Number of times to repeat an action without observing
+            (default: 1).
+        seed (int): Random seed
+            (default: none).
+    """
+
+    def __init__(
+        self, level, life_loss_terminal=False, life_loss_punishment=0.0,
+        repeat_action_probability=0.0, visualize=False, frame_skip=1, seed=None
+    ):
+        super().__init__()
+
+        from ale_py import ALEInterface
+
+        self.environment = ALEInterface()
+        self.rom_file = level
+
+        self.life_loss_terminal = life_loss_terminal
+        self.life_loss_punishment = life_loss_punishment
+
+        self.environment.setFloat(b'repeat_action_probability', repeat_action_probability)
+        self.environment.setBool(b'display_screen', visualize)
+        self.environment.setInt(b'frame_skip', frame_skip)
+        if seed is not None:
+            self.environment.setInt(b'random_seed', seed)
+
+        # All set commands must be done before loading the ROM.
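+        # (hence the ROM is loaded last, and the legal action set is queried only afterwards)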
+ self.environment.loadROM(self.rom_file.encode()) + self.available_actions = tuple(self.environment.getLegalActionSet()) + + # Full list of actions: + # No-Op, Fire, Up, Right, Left, Down, Up Right, Up Left, Down Right, Down Left, Up Fire, + # Right Fire, Left Fire, Down Fire, Up Right Fire, Up Left Fire, Down Right Fire, Down Left + # Fire + + def __str__(self): + return super().__str__() + '({})'.format(self.rom_file) + + def states(self): + width, height = self.environment.getScreenDims() + return dict(type='float', shape=(width, height, 3), min_value=0.0, max_value=1.0) + + def actions(self): + return dict(type='int', num_values=len(self.available_actions)) + + def close(self): + self.environment.__del__() + self.environment = None + + def get_states(self): + # screen = np.copy(self.environment.getScreenRGB(self.screen)) + screen = self.environment.getScreenRGB() + screen = screen.astype(dtype=np.float32) / 255.0 + return screen + + def reset(self): + self.environment.reset_game() + width, height = self.environment.getScreenDims() + # self.screen = np.empty((width, height, 3), dtype=np.uint8) + self.lives = self.environment.lives() + return self.get_states() + + def execute(self, actions): + reward = self.environment.act(self.available_actions[actions]) + terminal = self.environment.game_over() + states = self.get_states() + + next_lives = self.environment.lives() + if next_lives < self.lives: + if self.life_loss_terminal: + terminal = True + elif self.life_loss_punishment > 0.0: + reward -= self.life_loss_punishment + self.lives = next_lives + + return states, terminal, reward diff --git a/tensorforce/environments/carla/env_utils.py b/tensorforce/environments/carla/env_utils.py new file mode 100755 index 000000000..2c050490f --- /dev/null +++ b/tensorforce/environments/carla/env_utils.py @@ -0,0 +1,384 @@ +"""Utility functions for environment.py""" + +import os +import cv2 +import math +import random +import numpy as np +import carla +import pygame +import threading +import datetime + +from typing import Union +from tensorforce.agents import Agent + + +# constants: +epsilon = np.finfo(np.float32).eps + +# Use this dict to convert lanes objects to integers: +WAYPOINT_DICT = dict(lane_change={carla.LaneChange.NONE: 0, + carla.LaneChange.Both: 1, + carla.LaneChange.Left: 2, + carla.LaneChange.Right: 3}, + lane_type={carla.LaneType.NONE: 0, + carla.LaneType.Bidirectional: 1, + carla.LaneType.Biking: 2, + carla.LaneType.Border: 3, + carla.LaneType.Driving: 4, + carla.LaneType.Entry: 5, + carla.LaneType.Exit: 6, + carla.LaneType.Median: 7, + carla.LaneType.OffRamp: 8, + carla.LaneType.OnRamp: 9, + carla.LaneType.Parking: 10, + carla.LaneType.Rail: 11, + carla.LaneType.Restricted: 12, + carla.LaneType.RoadWorks: 13, + carla.LaneType.Shoulder: 14, + carla.LaneType.Sidewalk: 15, + carla.LaneType.Special1: 16, + carla.LaneType.Special2: 17, + carla.LaneType.Special3: 18, + carla.LaneType.Stop: 19, + carla.LaneType.Tram: 20, + carla.LaneType.Any: 21}, + lane_marking_type={carla.LaneMarkingType.NONE: 0, + carla.LaneMarkingType.BottsDots: 1, + carla.LaneMarkingType.Broken: 2, + carla.LaneMarkingType.BrokenBroken: 3, + carla.LaneMarkingType.BrokenSolid: 4, + carla.LaneMarkingType.Curb: 5, + carla.LaneMarkingType.Grass: 6, + carla.LaneMarkingType.Solid: 7, + carla.LaneMarkingType.SolidBroken: 8, + carla.LaneMarkingType.SolidSolid: 9, + carla.LaneMarkingType.Other: 10}, + traffic_light={carla.TrafficLightState.Green: 0, + carla.TrafficLightState.Red: 1, + carla.TrafficLightState.Yellow: 2, + 
carla.TrafficLightState.Off: 3, + carla.TrafficLightState.Unknown: 4} + ) + + +# ------------------------------------------------------------------------------------------------- +# -- PyGame +# ------------------------------------------------------------------------------------------------- + +def init_pygame(): + if not pygame.get_init(): + pygame.init() + + if not pygame.font.get_init(): + pygame.font.init() + + +def get_display(window_size, mode=pygame.HWSURFACE | pygame.DOUBLEBUF): + """Returns a display used to render images and text. + :param window_size: a tuple (width: int, height: int) + :param mode: pygame rendering mode. Default: pygame.HWSURFACE | pygame.DOUBLEBUF + :return: a pygame.display instance. + """ + return pygame.display.set_mode(window_size, mode) + + +def get_font(size=14): + return pygame.font.Font(pygame.font.get_default_font(), size) + + +def display_image(display, image, window_size=(800, 600), blend=False): + """Displays the given image on a pygame window + :param blend: whether to blend or not the given image. + :param window_size: the size of the pygame's window. Default is (800, 600) + :param display: pygame.display + :param image: the image (numpy.array) to display/render on. + """ + # Resize image if necessary + if (image.shape[1], image.shape[0]) != window_size: + image = resize(image, size=window_size) + + image_surface = pygame.surfarray.make_surface(image.swapaxes(0, 1)) + + if blend: + image_surface.set_alpha(100) + + display.blit(image_surface, (0, 0)) + + +def display_text(display, font, text: [str], color=(255, 255, 255), origin=(0, 0), offset=(0, 2)): + position = origin + + for line in text: + if isinstance(line, dict): + display.blit(font.render(line.get('text'), True, line.get('color', color)), position) + else: + display.blit(font.render(line, True, color), position) + + position = (position[0] + offset[0], position[1] + offset[1]) + + +def pygame_save(display, path: str, name: str = None): + if name is None: + name = 'image-' + str(datetime.datetime.now()) + '.jpg' + + thread = threading.Thread(target=lambda: pygame.image.save(display, os.path.join(path, name))) + thread.start() + + +# ------------------------------------------------------------------------------------------------- +# -- CARLA +# ------------------------------------------------------------------------------------------------- + +def get_client(address, port, timeout=2.0) -> carla.Client: + """Connects to the simulator. + @:returns a carla.Client instance if the CARLA simulator accepts the connection. + """ + client: carla.Client = carla.Client(address, port) + client.set_timeout(timeout) + return client + + +def random_blueprint(world: carla.World, actor_filter='vehicle.*', role_name='agent') -> carla.ActorBlueprint: + """Retrieves a random blueprint. + :param world: a carla.World instance. + :param actor_filter: a string used to filter (select) blueprints. Default: 'vehicle.*' + :param role_name: blueprint's role_name, Default: 'agent'. + :return: a carla.ActorBlueprint instance. 
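+    Example (illustrative): blueprint = random_blueprint(world, actor_filter='vehicle.audi.*')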
+ """ + blueprints = world.get_blueprint_library().filter(actor_filter) + blueprint: carla.ActorBlueprint = random.choice(blueprints) + blueprint.set_attribute('role_name', role_name) + + if blueprint.has_attribute('color'): + color = random.choice(blueprint.get_attribute('color').recommended_values) + blueprint.set_attribute('color', color) + + if blueprint.has_attribute('driver_id'): + driver_id = random.choice(blueprint.get_attribute('driver_id').recommended_values) + blueprint.set_attribute('driver_id', driver_id) + + if blueprint.has_attribute('is_invincible'): + blueprint.set_attribute('is_invincible', 'true') + + # set the max speed + if blueprint.has_attribute('speed'): + float(blueprint.get_attribute('speed').recommended_values[1]) + float(blueprint.get_attribute('speed').recommended_values[2]) + else: + print("No recommended values for 'speed' attribute") + + return blueprint + + +def random_spawn_point(world_map: carla.Map, different_from: carla.Location = None) -> carla.Transform: + """Returns a random spawning location. + :param world_map: a carla.Map instance obtained by calling world.get_map() + :param different_from: ensures that the location of the random spawn point is different from the one specified here. + :return: a carla.Transform instance. + """ + available_spawn_points = world_map.get_spawn_points() + + if different_from is not None: + while True: + spawn_point = random.choice(available_spawn_points) + + if spawn_point.location != different_from: + return spawn_point + else: + return random.choice(available_spawn_points) + + +def spawn_actor(world: carla.World, blueprint: carla.ActorBlueprint, spawn_point: carla.Transform, + attach_to: carla.Actor = None, attachment_type=carla.AttachmentType.Rigid) -> carla.Actor: + """Tries to spawn an actor in a CARLA simulator. + :param world: a carla.World instance. + :param blueprint: specifies which actor has to be spawned. + :param spawn_point: where to spawn the actor. A transform specifies the location and rotation. + :param attach_to: whether the spawned actor has to be attached (linked) to another one. + :param attachment_type: the kind of the attachment. Can be 'Rigid' or 'SpringArm'. + :return: a carla.Actor instance. + """ + actor = world.try_spawn_actor(blueprint, spawn_point, attach_to, attachment_type) + + if actor is None: + raise ValueError(f'Cannot spawn actor. Try changing the spawn_point ({spawn_point}) to something else.') + + return actor + + +def get_blueprint(world: carla.World, actor_id: str) -> carla.ActorBlueprint: + return world.get_blueprint_library().find(actor_id) + + +def global_to_local(point: carla.Location, reference: Union[carla.Transform, carla.Location, carla.Rotation]): + """Translates a 3D point from global to local coordinates using the current transformation as reference""" + if isinstance(reference, carla.Transform): + reference.transform(point) + elif isinstance(reference, carla.Location): + carla.Transform(reference, carla.Rotation()).transform(point) + elif isinstance(reference, carla.Rotation): + carla.Transform(carla.Location(), reference).transform(point) + else: + raise ValueError('Argument "reference" is none of carla.Transform or carla.Location or carla.Rotation!') + + +# ------------------------------------------------------------------------------------------------- +# -- Other +# ------------------------------------------------------------------------------------------------- + +def resize(image, size: (int, int), interpolation=cv2.INTER_CUBIC): + """Resize the given image. 
+ :param image: a numpy array with shape (height, width, channels). + :param size: (width, height) to resize the image to. + :param interpolation: Default: cv2.INTER_CUBIC. + :return: the reshaped image. + """ + return cv2.resize(image, dsize=size, interpolation=interpolation) + + +def scale(num, from_interval=(-1.0, +1.0), to_interval=(0.0, 7.0)) -> float: + """Scales (interpolates) the given number to a given interval. + :param num: a number + :param from_interval: the interval the number is assumed to lie in. + :param to_interval: the target interval. + :return: the scaled/interpolated number. + """ + x = np.interp(num, from_interval, to_interval) + return float(round(x)) + + +def cv2_grayscale(image, is_bgr=True, depth=1): + """Convert a RGB or BGR image to grayscale using OpenCV (cv2). + :param image: input image, a numpy.ndarray. + :param is_bgr: tells whether the image is in BGR format. If False, RGB format is assumed. + :param depth: replicates the gray depth channel multiple times. E.g. useful to display grayscale images as rgb. + """ + assert depth >= 1 + + if is_bgr: + grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + else: + grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) + + if depth > 1: + return np.stack((grayscale,) * depth, axis=-1) + + return grayscale + + +def save_agent(agent: Agent, agent_name: str, directory: str, separate_dir=True) -> str: + if separate_dir: + save_path = os.path.join(directory, agent_name) + os.makedirs(save_path, exist_ok=True) + else: + save_path = directory + + checkpoint_path = agent.save(directory=save_path, filename=agent_name) + return checkpoint_path + + +def get_record_path(base_dir: str, prefix='ep', pattern='-'): + dirs = sorted(os.listdir(base_dir)) + count = 0 + + if len(dirs) > 0: + count = 1 + int(dirs[-1].split(pattern)[1]) + + record_path = os.path.join(base_dir, f'{prefix}{pattern}{count}') + os.mkdir(record_path) + + return record_path + + +def replace_nans(data: dict, nan=0.0, pos_inf=0.0, neg_inf=0.0): + """In-place replacement of non-numerical values, i.e. NaNs and +/- infinity""" + for key, value in data.items(): + if np.isnan(value).any() or np.isinf(value).any(): + data[key] = np.nan_to_num(value, nan=nan, posinf=pos_inf, neginf=neg_inf) + + return data + + +# ------------------------------------------------------------------------------------------------- +# -- Debug +# ------------------------------------------------------------------------------------------------- + +class Colors(object): + """Wraps some carla.Color instances.""" + red = carla.Color(255, 0, 0) + green = carla.Color(0, 255, 0) + blue = carla.Color(47, 210, 231) + cyan = carla.Color(0, 255, 255) + yellow = carla.Color(255, 255, 0) + orange = carla.Color(255, 162, 0) + white = carla.Color(255, 255, 255) + black = carla.Color(0, 0, 0) + + +def draw_transform(debug, trans, col=Colors.red, lt=-1): + yaw_in_rad = math.radians(trans.rotation.yaw) + pitch_in_rad = math.radians(trans.rotation.pitch) + + p1 = carla.Location(x=trans.location.x + math.cos(pitch_in_rad) * math.cos(yaw_in_rad), + y=trans.location.y + math.cos(pitch_in_rad) * math.sin(yaw_in_rad), + z=trans.location.z + math.sin(pitch_in_rad)) + + debug.draw_arrow(trans.location, p1, thickness=0.05, arrow_size=0.1, color=col, life_time=lt) + + +def draw_radar_measurement(debug_helper: carla.DebugHelper, data: carla.RadarMeasurement, velocity_range=7.5, + size=0.075, life_time=0.06): + """Code adapted from carla/PythonAPI/examples/manual_control.py: + - White: means static points. 
+ - Red: indicates points moving towards the object. + - Blue: denoted points moving away. + """ + radar_rotation = data.transform.rotation + for detection in data: + azimuth = math.degrees(detection.azimuth) + radar_rotation.yaw + altitude = math.degrees(detection.altitude) + radar_rotation.pitch + + # move to local coordinates: + forward_vec = carla.Vector3D(x=detection.depth - 0.25) + global_to_local(forward_vec, + reference=carla.Rotation(pitch=altitude, yaw=azimuth, roll=radar_rotation.roll)) + + # draw: + debug_helper.draw_point(data.transform.location + forward_vec, size=size, life_time=life_time, + persistent_lines=False, color=carla.Color(255, 255, 255)) + + +# ------------------------------------------------------------------------------------------------- +# -- Math +# ------------------------------------------------------------------------------------------------- + +def l2_norm(location1, location2): + """Computes the Euclidean distance between two carla.Location objects.""" + dx = location1.x - location2.x + dy = location1.y - location2.y + dz = location1.z - location2.z + return math.sqrt(dx**2 + dy**2 + dz**2) + epsilon + + +def vector_norm(vec: carla.Vector3D) -> float: + """Returns the norm/magnitude (a scalar) of the given 3D vector.""" + return math.sqrt(vec.x**2 + vec.y**2 + vec.z**2) + + +def speed(actor: carla.Actor) -> float: + """Returns the speed of the given actor in km/h.""" + return 3.6 * vector_norm(actor.get_velocity()) + + +def dot_product(a: carla.Vector3D, b: carla.Vector3D) -> float: + return a.x * b.x + a.y * b.y + a.z * b.z + + +def cosine_similarity(a: carla.Vector3D, b: carla.Vector3D) -> float: + """-1: opposite vectors (pointing in the opposite direction), + 0: orthogonal, + 1: exactly the same (pointing in the same direction) + """ + return dot_product(a, b) / (vector_norm(a) * vector_norm(b)) diff --git a/tensorforce/environments/carla/sensors.py b/tensorforce/environments/carla/sensors.py new file mode 100755 index 000000000..b9427ead2 --- /dev/null +++ b/tensorforce/environments/carla/sensors.py @@ -0,0 +1,417 @@ +"""A collection of sensors helpers.""" + +import math +import numpy as np +import carla + + +class Sensor(object): + """Base class for wrapping sensors.""" + def __init__(self, parent_actor: carla.Actor, transform=carla.Transform(), attachment_type=None, + attributes: dict = None): + self.parent = parent_actor + self.world = self.parent.get_world() + self.attributes = attributes or dict() + self.event_callbacks = [] + + # Look for callback(s) + if 'callback' in self.attributes: + self.event_callbacks.append(self.attributes.pop('callback')) + + elif 'callbacks' in self.attributes: + for callback in self.attributes.pop('callbacks'): + self.event_callbacks.append(callback) + + # detector-sensors retrieve data only when triggered (not at each tick!) 
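+        # (a sensor counts as a detector here when its blueprint has no 'sensor_tick' attribute; see _spawn below)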
+ self.sensor, self.is_detector = self._spawn(transform, attachment_type) + + @property + def name(self) -> str: + raise NotImplementedError + + def set_parent_actor(self, actor: carla.Actor): + self.parent = actor + + def add_callback(self, callback): + assert callable(callback) + self.event_callbacks.append(callback) + + def clear_callbacks(self): + self.event_callbacks.clear() + + @staticmethod + def create(sensor_type, **kwargs): + if sensor_type == 'sensor.other.collision': + return CollisionDetector(**kwargs) + + elif sensor_type == 'sensor.other.lane_invasion': + return LaneInvasionSensor(**kwargs) + + elif sensor_type == 'sensor.other.gnss': + return GnssSensor(**kwargs) + + elif sensor_type == 'sensor.other.imu': + return IMUSensor(**kwargs) + + elif sensor_type == 'sensor.camera.rgb': + return RGBCameraSensor(**kwargs) + + elif sensor_type == 'sensor.camera.semantic_segmentation': + return SemanticCameraSensor(**kwargs) + + elif sensor_type == 'sensor.camera.depth': + return DepthCameraSensor(**kwargs) + + elif sensor_type == 'sensor.other.obstacle': + return ObstacleDetector(**kwargs) + + elif sensor_type == 'sensor.lidar.ray_cast': + return LidarSensor(**kwargs) + + elif sensor_type == 'sensor.other.radar': + return RadarSensor(**kwargs) + else: + raise ValueError(f'String `{sensor_type}` does not denote a valid sensor!') + + def start(self): + """Start listening for events""" + if not self.sensor.is_listening: + self.sensor.listen(self.on_event) + else: + print(f'Sensor {self.name} is already been started!') + + def stop(self): + """Stop listening for events""" + self.sensor.stop() + + def _spawn(self, transform, attachment_type=None): + """Spawns itself within a carla.World.""" + if attachment_type is None: + attachment_type = carla.AttachmentType.Rigid + + sensor_bp: carla.ActorBlueprint = self.world.get_blueprint_library().find(self.name) + + for attr, value in self.attributes.items(): + if sensor_bp.has_attribute(attr): + sensor_bp.set_attribute(attr, str(value)) + else: + print(f'Sensor {self.name} has no attribute `{attr}`') + + sensor_actor = self.world.spawn_actor(sensor_bp, transform, self.parent, attachment_type) + is_detector = not sensor_bp.has_attribute('sensor_tick') + + return sensor_actor, is_detector + + def on_event(self, event): + for callback in self.event_callbacks: + callback(event) + + def destroy(self): + if self.sensor is not None: + self.sensor.stop() + self.sensor.destroy() + self.sensor = None + + self.parent = None + self.world = None + + +# ------------------------------------------------------------------------------------------------- +# -- Camera Sensors +# ------------------------------------------------------------------------------------------------- + +class CameraSensor(Sensor): + def __init__(self, color_converter=carla.ColorConverter.Raw, **kwargs): + super().__init__(**kwargs) + self.color_converter = color_converter + + @property + def name(self): + raise NotImplementedError + + def convert_image(self, image: carla.Image, color_converter=None): + color_converter = color_converter or self.color_converter or carla.ColorConverter.Raw + image.convert(color_converter) + + array = np.frombuffer(image.raw_data, dtype=np.uint8) + array = np.reshape(array, (image.height, image.width, 4)) + array = array[:, :, :3] + array = array[:, :, ::-1] + return array + + def save_to_disk(self, image: carla.Image, path: str): + """Saves the carla.Image to disk using its color_converter.""" + assert isinstance(image, carla.Image) + assert isinstance(path, 
str) + image.save_to_disk(path, color_converter=self.color_converter) + + +class RGBCameraSensor(CameraSensor): + @property + def name(self): + return 'sensor.camera.rgb' + + +class DepthCameraSensor(CameraSensor): + @property + def name(self): + return 'sensor.camera.depth' + + +class SemanticCameraSensor(CameraSensor): + @property + def name(self): + return 'sensor.camera.semantic_segmentation' + + +# ------------------------------------------------------------------------------------------------- +# -- Detector Sensors +# ------------------------------------------------------------------------------------------------- + +class CollisionDetector(Sensor): + def __init__(self, parent_actor, **kwargs): + super().__init__(parent_actor, **kwargs) + + @property + def name(self): + return 'sensor.other.collision' + + +class LaneInvasionSensor(Sensor): + def __init__(self, parent_actor, **kwargs): + super().__init__(parent_actor, **kwargs) + + @property + def name(self): + return 'sensor.other.lane_invasion' + + +class ObstacleDetector(Sensor): + def __init__(self, parent_actor, **kwargs): + super().__init__(parent_actor, **kwargs) + + @property + def name(self): + return 'sensor.other.obstacle' + + +# ------------------------------------------------------------------------------------------------- +# -- Other Sensors +# ------------------------------------------------------------------------------------------------- + +class LidarSensor(Sensor): + def __init__(self, parent_actor, **kwargs): + super().__init__(parent_actor, **kwargs) + + @property + def name(self): + return 'sensor.lidar.ray_cast' + + +class RadarSensor(Sensor): + def __init__(self, parent_actor, **kwargs): + super().__init__(parent_actor, **kwargs) + + @property + def name(self): + return 'sensor.other.radar' + + @staticmethod + def convert(radar_measurement: carla.RadarMeasurement): + """Converts a carla.RadarMeasurement into a numpy array [[velocity, altitude, azimuth, depth]]""" + points = np.frombuffer(radar_measurement.raw_data, dtype=np.dtype('f4')) + points = np.reshape(points, (len(radar_measurement), 4)) + return points + + +class GnssSensor(Sensor): + def __init__(self, parent_actor, transform=carla.Transform(carla.Location(x=1.0, z=2.8)), **kwargs): + super().__init__(parent_actor, transform=transform, **kwargs) + self.lat = 0.0 + self.lon = 0.0 + + @property + def name(self): + return 'sensor.other.gnss' + + def on_event(self, event): + super().on_event(event) + self.lat = event.latitude + self.lon = event.longitude + + def destroy(self): + super().destroy() + self.lat = None + self.lon = None + + +class IMUSensor(Sensor): + def __init__(self, parent_actor, **kwargs): + super().__init__(parent_actor, **kwargs) + self.accelerometer = (0.0, 0.0, 0.0) + self.gyroscope = (0.0, 0.0, 0.0) + self.compass = 0.0 + + @property + def name(self): + return 'sensor.other.imu' + + def on_event(self, event): + super().on_event(event) + limits = (-99.9, 99.9) + + self.accelerometer = ( + max(limits[0], min(limits[1], event.accelerometer.x)), + max(limits[0], min(limits[1], event.accelerometer.y)), + max(limits[0], min(limits[1], event.accelerometer.z))) + + self.gyroscope = ( + max(limits[0], min(limits[1], math.degrees(event.gyroscope.x))), + max(limits[0], min(limits[1], math.degrees(event.gyroscope.y))), + max(limits[0], min(limits[1], math.degrees(event.gyroscope.z)))) + + self.compass = math.degrees(event.compass) + + def destroy(self): + super().destroy() + self.accelerometer = None + self.gyroscope = None + self.compass = 
None + + +# ------------------------------------------------------------------------------------------------- +# -- Sensors specifications +# ------------------------------------------------------------------------------------------------- + +class SensorSpecs(object): + ATTACHMENT_TYPE = {'SpringArm': carla.AttachmentType.SpringArm, + 'Rigid': carla.AttachmentType.Rigid, + None: carla.AttachmentType.Rigid} + + COLOR_CONVERTER = {'Raw': carla.ColorConverter.Raw, + 'CityScapesPalette': carla.ColorConverter.CityScapesPalette, + 'Depth': carla.ColorConverter.Depth, + 'LogarithmicDepth': carla.ColorConverter.LogarithmicDepth, + None: carla.ColorConverter.Raw} + + @staticmethod + def get_position(position: str = None) -> carla.Transform: + if position == 'top': + return carla.Transform(carla.Location(x=-5.5, z=2.5), carla.Rotation(pitch=8.0)) + elif position == 'top-view': + return carla.Transform(carla.Location(x=-8.0, z=6.0), carla.Rotation(pitch=6.0)) + elif position == 'front': + return carla.Transform(carla.Location(x=1.5, z=1.8)) + elif position == 'on-top': + return carla.Transform(carla.Location(x=-0.9, y=0.0, z=2.2)) + elif position == 'on-top2': + return carla.Transform(carla.Location(x=0.0, y=0.0, z=2.2)) + elif position == 'radar': + return carla.Transform(carla.Location(x=2.8, z=1.0), carla.Rotation(pitch=5)) + else: + return carla.Transform() + + @staticmethod + def set(sensor_spec: dict, **kwargs): + for key, value in kwargs.items(): + if key == 'position': + sensor_spec['transform'] = SensorSpecs.get_position(value) + elif key == 'attachment_type': + sensor_spec[key] = SensorSpecs.ATTACHMENT_TYPE[value] + elif key == 'color_converter': + sensor_spec[key] = SensorSpecs.COLOR_CONVERTER[value] + + @staticmethod + def add_callback(sensor_spec: dict, callback): + assert callable(callback) + assert isinstance(sensor_spec, dict) + + attributes = sensor_spec.get('attributes', dict()) + + if 'callback' in attributes: + attributes['callbacks'] = [callback, attributes.pop('callback')] + + elif 'callbacks' in attributes: + attributes['callbacks'].append(callback) + else: + attributes['callback'] = callback + + sensor_spec['attributes'] = attributes + + @staticmethod + def set_color_converter(camera_spec: dict, color_converter: str = None): + camera_spec['color_converter'] = SensorSpecs.COLOR_CONVERTER[color_converter] + return SensorSpecs + + @staticmethod + def camera(kind: str, transform: carla.Transform = None, position: str = None, attachment_type=None, + color_converter=None, **kwargs) -> dict: + assert kind in ['rgb', 'depth', 'semantic_segmentation'] + return dict(type='sensor.camera.' 
+ kind, + transform=transform or SensorSpecs.get_position(position), + attachment_type=SensorSpecs.ATTACHMENT_TYPE[attachment_type], + color_converter=SensorSpecs.COLOR_CONVERTER[color_converter], + attributes=kwargs) + + @staticmethod + def rgb_camera(transform: carla.Transform = None, position: str = None, attachment_type='SpringArm', + color_converter='Raw', **kwargs): + return SensorSpecs.camera('rgb', transform, position, attachment_type, color_converter, **kwargs) + + @staticmethod + def depth_camera(transform: carla.Transform = None, position: str = None, attachment_type='SpringArm', + color_converter='LogarithmicDepth', **kwargs): + return SensorSpecs.camera('depth', transform, position, attachment_type, color_converter, **kwargs) + + @staticmethod + def segmentation_camera(transform: carla.Transform = None, position: str = None, attachment_type='SpringArm', + color_converter='CityScapesPalette', **kwargs): + return SensorSpecs.camera('semantic_segmentation', transform, position, attachment_type, color_converter, **kwargs) + + @staticmethod + def detector(kind: str, transform: carla.Transform = None, position: str = None, attachment_type=None, + **kwargs) -> dict: + assert kind in ['collision', 'lane_invasion', 'obstacle'] + return dict(type='sensor.other.' + kind, + transform=transform or SensorSpecs.get_position(position), + attachment_type=SensorSpecs.ATTACHMENT_TYPE[attachment_type], + attributes=kwargs) + + @staticmethod + def collision_detector(transform: carla.Transform = None, position: str = None, attachment_type='Rigid', **kwargs): + return SensorSpecs.detector('collision', transform, position, attachment_type, **kwargs) + + @staticmethod + def lane_detector(transform: carla.Transform = None, position: str = None, attachment_type='Rigid', **kwargs): + return SensorSpecs.detector('lane_invasion', transform, position, attachment_type, **kwargs) + + @staticmethod + def obstacle_detector(transform: carla.Transform = None, position: str = None, attachment_type='Rigid', **kwargs): + return SensorSpecs.detector('obstacle', transform, position, attachment_type, **kwargs) + + @staticmethod + def other(kind: str, transform: carla.Transform = None, position: str = None, attachment_type=None, **kwargs) -> dict: + assert kind in ['imu', 'gnss', 'radar'] + return dict(type='sensor.other.' 
+ kind,
+                    transform=transform or SensorSpecs.get_position(position),
+                    attachment_type=SensorSpecs.ATTACHMENT_TYPE[attachment_type],
+                    attributes=kwargs)
+
+    @staticmethod
+    def lidar(transform: carla.Transform = None, position: str = None, attachment_type=None, **kwargs) -> dict:
+        return dict(type='sensor.lidar.ray_cast',
+                    transform=transform or SensorSpecs.get_position(position),
+                    attachment_type=SensorSpecs.ATTACHMENT_TYPE[attachment_type],
+                    attributes=kwargs)
+
+    @staticmethod
+    def radar(transform: carla.Transform = None, position: str = None, attachment_type='Rigid', **kwargs):
+        return SensorSpecs.other('radar', transform, position, attachment_type, **kwargs)
+
+    @staticmethod
+    def imu(transform: carla.Transform = None, position: str = None, attachment_type='Rigid', **kwargs):
+        return SensorSpecs.other('imu', transform, position, attachment_type, **kwargs)
+
+    @staticmethod
+    def gnss(transform: carla.Transform = None, position: str = None, attachment_type='Rigid', **kwargs):
+        return SensorSpecs.other('gnss', transform, position, attachment_type, **kwargs) diff --git a/tensorforce/environments/carla/synchronous_mode.py b/tensorforce/environments/carla/synchronous_mode.py new file mode 100644 index 000000000..848e30141 --- /dev/null +++ b/tensorforce/environments/carla/synchronous_mode.py @@ -0,0 +1,86 @@
+import carla
+import queue
+
+
+class CARLASyncContext(object):
+    """
+    Context manager to synchronize output from different sensors. Synchronous
+    mode is enabled as long as we are inside this context:
+
+        with CARLASyncContext(world, sensors) as sync_mode:
+            while True:
+                data = sync_mode.tick(timeout=1.0)
+
+    This code is based on https://github.com/carla-simulator/carla/blob/master/PythonAPI/examples/synchronous_mode.py
+    """
+
+    def __init__(self, world, sensors: dict, fps=30):
+        self.world = world
+        self.sensors = sensors
+        self.frame = None
+        self.delta_seconds = 1.0 / fps
+        self._settings = None
+
+        # Make a queue for each sensor and for the world:
+        self._queues = dict()
+        self._add_queue('world', self.world.on_tick)
+
+        for name, sensor in self.sensors.items():
+            self._add_queue(name, sensor.add_callback)
+
+    def __enter__(self):
+        self._settings = self.world.get_settings()
+        self.frame = self.world.apply_settings(carla.WorldSettings(
+            no_rendering_mode=False,
+            fixed_delta_seconds=self.delta_seconds,
+            synchronous_mode=True))
+
+        for sensor in self.sensors.values():
+            sensor.start()
+
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self.world.apply_settings(self._settings)
+
+        for sensor in self.sensors.values():
+            sensor.stop()
+
+    def tick(self, timeout):
+        self.frame = self.world.tick()
+
+        data = dict()
+        for name, q in self._queues.items():
+
+            if name != 'world' and self.sensors[name].is_detector:
+                # Detectors retrieve data only when triggered, so they must not block waiting for it
+                data[name] = self._get_detector_data(q)
+            else:
+                # Cameras and the other sensors retrieve data at every simulation step
+                data[name] = self._get_sensor_data(q, timeout)
+
+        return data
+
+    def _add_queue(self, name, register_event):
+        """Registers an event on its own queue, identified by name"""
+        q = queue.Queue()
+        register_event(q.put)
+        self._queues[name] = q
+
+    @staticmethod
+    def _get_detector_data(sensor_queue: queue.Queue):
+        """Retrieves data for a detector; the call is non-blocking, thus it doesn't wait for available data."""
+        data = []
+
+        while not sensor_queue.empty():
+            data.append(sensor_queue.get_nowait())
+
+        return data
+
+    def _get_sensor_data(self, sensor_queue: queue.Queue,
timeout: float):
+        """Retrieves data for sensors (i.e. cameras and others); blocks until data for the current frame is available or the timeout expires."""
+        while True:
+            data = sensor_queue.get(timeout=timeout)
+
+            if data.frame == self.frame:
+                return data diff --git a/tensorforce/environments/carla_environment.py b/tensorforce/environments/carla_environment.py new file mode 100755 index 000000000..ec289b1c2 --- /dev/null +++ b/tensorforce/environments/carla_environment.py @@ -0,0 +1,446 @@
+import math
+import os
+import numpy as np
+
+from datetime import datetime
+
+from tensorforce.environments import Environment
+
+try:
+    import carla
+    import pygame
+
+    from tensorforce.environments.carla import env_utils
+    from tensorforce.environments.carla.env_utils import WAYPOINT_DICT
+    from tensorforce.environments.carla.sensors import Sensor, SensorSpecs
+    from tensorforce.environments.carla.synchronous_mode import CARLASyncContext
+except ImportError:
+    pass
+
+
+class CARLAEnvironment(Environment):
+    """A Tensorforce Environment for the [CARLA driving simulator](https://github.com/carla-simulator/carla).
+    - This environment is "synchronized" with the server, meaning that the server waits for a client tick. For a
+      detailed explanation of this, please refer to https://carla.readthedocs.io/en/latest/adv_synchrony_timestep/.
+    - Subclass to customize the behaviour of states, actions, sensors, reward function, agent, training loop, etc.
+
+    Requires you to:
+    - Install `pygame` and `opencv`
+    - Install the CARLA simulator (version >= 0.9.8): https://carla.readthedocs.io/en/latest/start_quickstart
+    - Install CARLA's Python bindings:
+      --> Follow this [guide](https://carla.readthedocs.io/en/latest/build_system/#pythonapi); if you have trouble
+          with that, then follow the steps below.
+      --> `cd your-path-to/CARLA_0.9.x/PythonAPI/carla/dist/`
+      --> Extract `carla-0.9.x-py3.5-YOUR_OS-x86_64.egg` where `YOUR_OS` depends on your OS, i.e. `linux` or `windows`
+      --> Create a `setup.py` file within the extracted folder and write the following:
+          ```python
+          from distutils.core import setup
+
+          setup(name='carla',
+                version='0.9.x',
+                py_modules=['carla'])
+          ```
+      --> Install via pip: `pip install -e ~/CARLA_0.9.x/PythonAPI/carla/dist/carla-0.9.x-py3.5-YOUR_OS-x86_64`
+    - Run the CARLA simulator from the command line: `your-path-to/CARLA_0.9.x/./CarlaUE4.sh` (or `CarlaUE4.exe` on Windows)
+      --> To use fewer resources add these flags: `-windowed -ResX=8 -ResY=8 --quality-level=Low`
+
+    Hardware requirements (recommended):
+    - GPU: dedicated, with at least 2/4 GB.
+    - RAM: 16 GB suggested.
+    - CPU: multicore, at least 4.
+    - Note: on my hardware (i7 4700HQ 4C/8T, GT 750M 4GB, 16GB RAM) I achieve about 20 FPS in low quality mode.
+
+    Example usage:
+    - See [tensorforce/examples](https://github.com/tensorforce/tensorforce/tree/master/examples)
+
+    Known Issues:
+    - Tensorforce's Runner is currently not compatible with this environment!
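    For a concrete picture, here is a minimal usage sketch (it assumes a CARLA server running on `localhost:2000`;
    the subclass name, sensor suite and agent hyperparameters are illustrative placeholders, not part of this module):

    ```python
    from tensorforce import Agent
    from tensorforce.environments.carla.sensors import SensorSpecs
    from tensorforce.environments.carla_environment import CARLAEnvironment


    class MyCarlaEnv(CARLAEnvironment):
        # Illustrative subclass: provides a default agent and a custom sensor suite.

        def default_agent(self, max_episode_timesteps=None, **kwargs):
            return Agent.create(agent='ppo', environment=self, batch_size=1,
                                max_episode_timesteps=max_episode_timesteps)

        def default_sensors(self) -> dict:
            return dict(imu=SensorSpecs.imu(),
                        collision=SensorSpecs.collision_detector(callback=self.on_collision),
                        camera=SensorSpecs.rgb_camera(position='top',
                                                      image_size_x=800, image_size_y=600))


    env = MyCarlaEnv(image_shape=(150, 200, 3), render=True)
    env.train(agent=None, num_episodes=5, max_episode_timesteps=256)
    ```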
+
+    Author:
+    - Luca Anzalone (@luca96)
+    """
+    # States and actions specifications:
+    # Actions: throttle or brake, steer, reverse (bool)
+    ACTIONS_SPEC = dict(type='float', shape=(3,), min_value=-1.0, max_value=1.0)
+    DEFAULT_ACTIONS = np.array([0.0, 0.0, 0.0])
+
+    # Vehicle: speed, control (4), accelerometer (x, y, z), gyroscope (x, y, z), position (x, y), compass
+    VEHICLE_FEATURES_SPEC = dict(type='float', shape=(14,))
+
+    # Road: intersection (bool), junction (bool), speed_limit, lane_width, lane_change, left_lane, right_lane
+    ROAD_FEATURES_SPEC = dict(type='float', shape=(8,))
+
+    # TODO: add a map loading functionality (specified or at random) - load_map
+    def __init__(self, address='localhost', port=2000, timeout=2.0, image_shape=(150, 200, 3), window_size=(800, 600),
+                 vehicle_filter='vehicle.*', sensors: dict = None, route_resolution=2.0, fps=30.0, render=True,
+                 debug=False):
+        """
+        :param address: CARLA simulator's IP address. Required only if the simulator runs on a different machine.
+        :param port: CARLA simulator's port.
+        :param timeout: connection timeout.
+        :param image_shape: shape of the image observations.
+        :param window_size: pygame's window size. Meaningful only if `render=True`.
+        :param vehicle_filter: used to spawn a particular vehicle (e.g. 'vehicle.tesla.model3') or class of vehicles
+            (e.g. 'vehicle.audi.*')
+        :param sensors: specifies which sensors should be equipped on the vehicle, better specified by subclassing
+            `default_sensors()`.
+        :param route_resolution: route planner resolution grain.
+        :param fps: maximum framerate; it depends on your computing power.
+        :param render: if True a pygame window is shown.
+        :param debug: enable to display some useful information about the vehicle.
+        """
+        super().__init__()
+        env_utils.init_pygame()
+
+        self.timeout = timeout
+        self.client = env_utils.get_client(address, port, self.timeout)
+        self.world = self.client.get_world()  # type: carla.World
+        self.map = self.world.get_map()  # type: carla.Map
+        self.synchronous_context = None
+
+        # set fixed fps:
+        self.world.apply_settings(carla.WorldSettings(
+            no_rendering_mode=False,
+            synchronous_mode=False,
+            fixed_delta_seconds=1.0 / fps))
+
+        # vehicle
+        self.vehicle_filter = vehicle_filter
+        self.vehicle = None  # type: carla.Vehicle
+
+        # actions
+        self.control = None  # type: carla.VehicleControl
+        self.prev_actions = None
+
+        # weather
+        # TODO: add weather support
+
+        # visualization and debugging stuff
+        self.image_shape = image_shape
+        self.image_size = (image_shape[1], image_shape[0])
+        self.DEFAULT_IMAGE = np.zeros(shape=self.image_shape, dtype=np.float32)
+        self.fps = fps
+        self.tick_time = 1.0 / self.fps
+        self.should_render = render
+        self.should_debug = debug
+        self.clock = pygame.time.Clock()
+
+        if self.should_render:
+            self.window_size = window_size
+            self.font = env_utils.get_font(size=13)
+            self.display = env_utils.get_display(window_size)
+
+        # variables for reward computation
+        self.collision_penalty = 0.0
+
+        # vehicle sensors suite
+        self.sensors_spec = sensors if isinstance(sensors, dict) else self.default_sensors()
+        self.sensors = dict()
+
+    def states(self):
+        return dict(image=dict(shape=self.image_shape),
+                    vehicle_features=self.VEHICLE_FEATURES_SPEC,
+                    road_features=self.ROAD_FEATURES_SPEC,
+                    previous_actions=self.ACTIONS_SPEC)
+
+    def actions(self):
+        return self.ACTIONS_SPEC
+
+    def reset(self, soft=False):
+        self._reset_world(soft=soft)
+
+        # reset actions
+        self.control = carla.VehicleControl()
+        self.prev_actions = self.DEFAULT_ACTIONS
+
+        observation = env_utils.replace_nans(self._get_observation(sensors_data={}))
+        return observation
+
+    def reward(self, actions, time_cost=-1.0, a=2.0):
+        """An example reward function. Subclass to define your own."""
+        speed = env_utils.speed(self.vehicle)
+        speed_limit = self.vehicle.get_speed_limit()
+
+        if speed <= speed_limit:
+            speed_penalty = 0.0
+        else:
+            speed_penalty = a * (speed_limit - speed)
+
+        return time_cost - self.collision_penalty + speed_penalty
+
+    def execute(self, actions, record_path: str = None):
+        self.prev_actions = actions
+
+        pygame.event.get()
+        self.clock.tick()
+
+        sensors_data = self.world_step(actions, record_path=record_path)
+
+        reward = self.reward(actions)
+        terminal = self.terminal_condition()
+        next_state = env_utils.replace_nans(self._get_observation(sensors_data))
+
+        self.collision_penalty = 0.0
+
+        return next_state, terminal, reward
+
+    def terminal_condition(self):
+        """Tells whether the episode is terminated or not. Override with your own termination condition."""
+        return False
+
+    def close(self):
+        super().close()
+
+        if self.vehicle:
+            self.vehicle.destroy()
+
+        for sensor in self.sensors.values():
+            sensor.destroy()
+
+    def train(self, agent, num_episodes: int, max_episode_timesteps: int, weights_dir='weights/agents',
+              agent_name='carla-agent', load_agent=False, record_dir='data/recordings', skip_frames=25):
+        record_path = None
+        should_record = isinstance(record_dir, str)
+        should_save = isinstance(weights_dir, str)
+
+        if agent is None:
+            print('Using default agent...')
+            agent = self.default_agent(max_episode_timesteps=max_episode_timesteps)
+
+        try:
+            if load_agent:
+                agent.load(directory=os.path.join(weights_dir, agent_name), filename=agent_name, environment=self,
+                           format='tensorflow')
+                print('Agent loaded.')
+
+            for episode in range(num_episodes):
+                states = self.reset()
+                total_reward = 0.0
+
+                if should_record:
+                    record_path = env_utils.get_record_path(base_dir=record_dir)
+                    print(f'Recording in {record_path}.')
+
+                with self.synchronous_context:
+                    self.skip(num_frames=skip_frames)
+                    t0 = datetime.now()
+
+                    for i in range(max_episode_timesteps):
+                        actions = agent.act(states)
+                        states, terminal, reward = self.execute(actions, record_path=record_path)
+
+                        total_reward += reward
+                        terminal = terminal or (i == max_episode_timesteps - 1)
+
+                        if agent.observe(reward, terminal):
+                            print(f'{i + 1}/{max_episode_timesteps} -> update performed.')
+
+                        if terminal:
+                            elapsed = str(datetime.now() - t0).split('.')[0]
+                            print(f'Episode-{episode} completed in {elapsed}, total_reward: {round(total_reward, 2)}\n')
+                            break
+
+                if should_save:
+                    env_utils.save_agent(agent, agent_name, directory=weights_dir)
+                    print('Agent saved.')
+        finally:
+            self.close()
+
+    def default_sensors(self) -> dict:
+        """Returns a predefined dict of sensor specifications"""
+        return dict(imu=SensorSpecs.imu(),
+                    collision=SensorSpecs.collision_detector(callback=self.on_collision),
+                    camera=SensorSpecs.rgb_camera(position='top',
+                                                  image_size_x=self.window_size[0], image_size_y=self.window_size[1],
+                                                  sensor_tick=self.tick_time))
+
+    def default_agent(self, **kwargs):
+        """Returns a predefined agent for this environment"""
+        raise NotImplementedError('Implement this to define your own default agent!')
+
+    def on_collision(self, event, penalty=1000.0):
+        impulse = math.sqrt(env_utils.vector_norm(event.normal_impulse))
+        actor_type = event.other_actor.type_id
+
+        if 'pedestrian' in actor_type:
+            self.collision_penalty += penalty * impulse
+
+        elif 'vehicle' in actor_type:
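            # colliding with another vehicle is penalized half as much as hitting a pedestrian (or any other actor)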
+            self.collision_penalty += penalty / 2.0 * impulse
+        else:
+            self.collision_penalty += penalty * impulse
+
+    def render(self, sensors_data: dict):
+        """Renders the sensors' output"""
+        image = sensors_data['camera']
+        env_utils.display_image(self.display, image, window_size=self.window_size)
+
+    def debug(self, actions):
+        env_utils.display_text(self.display, self.font, text=self.debug_text(actions), origin=(16, 12),
+                               offset=(0, 16))
+
+    def debug_text(self, actions):
+        return ['%d FPS' % self.clock.get_fps(),
+                '',
+                'Throttle: %.2f' % self.control.throttle,
+                'Steer: %.2f' % self.control.steer,
+                'Brake: %.2f' % self.control.brake,
+                'Reverse: %s' % ('T' if self.control.reverse else 'F'),
+                'Hand brake: %s' % ('T' if self.control.hand_brake else 'F'),
+                'Gear: %s' % {-1: 'R', 0: 'N'}.get(self.control.gear, self.control.gear),
+                '',
+                'Speed %.1f km/h' % env_utils.speed(self.vehicle),
+                'Speed limit %.1f km/h' % self.vehicle.get_speed_limit(),
+                '',
+                'Reward: %.2f' % self.reward(actions),
+                'Collision penalty: %.2f' % self.collision_penalty]
+
+    def skip(self, num_frames=10):
+        """Skips the given number of frames"""
+        for _ in range(num_frames):
+            self.synchronous_context.tick(timeout=self.timeout)
+
+        if num_frames > 0:
+            print(f'Skipped {num_frames} frames.')
+
+    def before_world_step(self):
+        """Callback: called before world.tick()"""
+        pass
+
+    def after_world_step(self, sensors_data: dict):
+        """Callback: called after world.tick()."""
+        pass
+
+    def on_sensors_data(self, data: dict) -> dict:
+        """Callback. Triggered when a world 'tick' occurs, meaning that data from the sensors has been collected
+        because a simulation step of the CARLA world has been completed.
+        - Use this method to preprocess the sensors' output data for: rendering, observation, ...
+        """
+        data['camera'] = self.sensors['camera'].convert_image(data['camera'])
+        return data
+
+    def world_step(self, actions, record_path: str = None):
+        """Applies the actions to the vehicle, and updates the CARLA world"""
+        # [pre-tick updates] Apply control to update the vehicle
+        self.actions_to_control(actions)
+        self.vehicle.apply_control(self.control)
+
+        self.before_world_step()
+
+        # Advance the simulation and wait for sensors' data.
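        # Note: `tick()` returns a dict keyed by sensor name; detector sensors yield a (possibly empty)
        # list of events, while cameras and the other sensors yield exactly one measurement per step.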
+        data = self.synchronous_context.tick(timeout=self.timeout)
+        data = self.on_sensors_data(data)
+
+        # [post-tick updates] Update world-related stuff
+        self.after_world_step(data)
+
+        # Draw and debug:
+        if self.should_render:
+            self.render(sensors_data=data)
+
+            if self.should_debug:
+                self.debug(actions)
+
+            pygame.display.flip()
+
+            if isinstance(record_path, str):
+                env_utils.pygame_save(self.display, record_path)
+
+        return data
+
+    def _reset_world(self, soft=False):
+        # init actor
+        if not soft:
+            spawn_point = env_utils.random_spawn_point(self.map)
+            self.spawn_point = spawn_point
+        else:
+            spawn_point = self.spawn_point
+
+        if self.vehicle is None:
+            blueprint = env_utils.random_blueprint(self.world, actor_filter=self.vehicle_filter)
+            self.vehicle = env_utils.spawn_actor(self.world, blueprint, spawn_point)  # type: carla.Vehicle
+
+            self._create_sensors()
+            self.synchronous_context = CARLASyncContext(self.world, self.sensors, fps=self.fps)
+        else:
+            self.vehicle.apply_control(carla.VehicleControl())
+            self.vehicle.set_velocity(carla.Vector3D(x=0.0, y=0.0, z=0.0))
+            self.vehicle.set_transform(spawn_point)
+
+        # reset reward variables
+        self.collision_penalty = 0.0
+
+    def actions_to_control(self, actions):
+        """Specifies the mapping between an actions vector and the vehicle's control."""
+        # throttle and brake are mutually exclusive:
+        self.control.throttle = float(actions[0]) if actions[0] > 0 else 0.0
+        self.control.brake = float(-actions[0]) if actions[0] < 0 else 0.0
+
+        # steering
+        self.control.steer = float(actions[1])
+
+        # reverse motion:
+        self.control.reverse = bool(actions[2] > 0)
+
+    def _get_observation(self, sensors_data: dict):
+        image = sensors_data.get('camera', self.DEFAULT_IMAGE)
+
+        if image.shape != self.image_shape:
+            image = env_utils.resize(image, size=self.image_size)
+
+        # Normalize pixel values to the range [-1, +1]
+        observation = dict(image=(2 * image - 255.0) / 255.0,
+                           vehicle_features=self._get_vehicle_features(),
+                           road_features=self._get_road_features(),
+                           previous_actions=self.prev_actions)
+        return observation
+
+    def _get_vehicle_features(self):
+        t = self.vehicle.get_transform()
+        control = self.vehicle.get_control()
+
+        imu_sensor = self.sensors['imu']
+        gyroscope = imu_sensor.gyroscope
+        accelerometer = imu_sensor.accelerometer
+
+        return [
+            env_utils.speed(self.vehicle),
+            control.gear,
+            control.steer,
+            control.throttle,
+            control.brake,
+            # Accelerometer:
+            accelerometer[0],
+            accelerometer[1],
+            accelerometer[2],
+            # Gyroscope:
+            gyroscope[0],
+            gyroscope[1],
+            gyroscope[2],
+            # Location
+            t.location.x,
+            t.location.y,
+            # Compass:
+            math.radians(imu_sensor.compass)]
+
+    def _get_road_features(self):
+        waypoint = self.map.get_waypoint(self.vehicle.get_location())
+        speed_limit = self.vehicle.get_speed_limit()
+
+        return [float(waypoint.is_intersection),
+                float(waypoint.is_junction),
+                waypoint.lane_width,
+                math.log2(speed_limit),
+                # Lane:
+                WAYPOINT_DICT['lane_type'][waypoint.lane_type],
+                WAYPOINT_DICT['lane_change'][waypoint.lane_change],
+                WAYPOINT_DICT['lane_marking_type'][waypoint.left_lane_marking.type],
+                WAYPOINT_DICT['lane_marking_type'][waypoint.right_lane_marking.type]]
+
+    def _create_sensors(self):
+        for name, args in self.sensors_spec.items():
+            if name == 'world':
+                raise ValueError('Cannot name a sensor `world` because it is reserved.')
+
+            kwargs = args.copy()
+            sensor = Sensor.create(sensor_type=kwargs.pop('type'), parent_actor=self.vehicle, **kwargs)
+            self.sensors[name] = sensor diff --git a/tensorforce/environments/cartpole.py
b/tensorforce/environments/cartpole.py new file mode 100644 index 000000000..f234a2c22 --- /dev/null +++ b/tensorforce/environments/cartpole.py @@ -0,0 +1,203 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy as np + +from tensorforce.environments import Environment + + +class CartPole(Environment): + """ + Based on OpenAI Gym version + (https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py) + """ + + def __init__( + self, + # Physics parameters + pole_mass=(0.05, 0.5), # 0.1 + pole_length=(0.1, 1.0), # 0.5, actually half the pole's length + cart_mass=(0.5, 1.5), + relative_force=(0.75, 1.5), # 1.0, relative to gravity + gravity=9.8, + # State space + state_velocities=True, + state_location=False, # true + state_initial_max_angle=0.05, + state_initial_max_angle_velocity=0.05, + state_initial_max_velocity=0.05, + # Action space + action_timedelta=0.02, + action_continuous=False, + action_noop=True # false + ): + super().__init__() + + # Physics parameters + if isinstance(pole_mass, tuple): + assert len(pole_mass) == 2 and 0.0 < pole_mass[0] < pole_mass[1] + self._pole_mass_range = (float(pole_mass[0]), float(pole_mass[1])) + else: + assert pole_mass > 0.0 + self._pole_mass_range = (float(pole_mass), float(pole_mass)) + if isinstance(pole_length, tuple): + assert len(pole_length) == 2 and 0.0 < pole_length[0] < pole_length[1] + self._pole_length_range = (float(pole_length[0]), float(pole_length[1])) + else: + assert pole_length > 0.0 + self._pole_length_range = (float(pole_length), float(pole_length)) + if isinstance(cart_mass, tuple): + assert len(cart_mass) == 2 and 0.0 < cart_mass[0] < cart_mass[1] + self._cart_mass_range = (float(cart_mass[0]), float(cart_mass[1])) + else: + assert cart_mass > 0.0 + self._cart_mass_range = (float(cart_mass), float(cart_mass)) + if isinstance(relative_force, tuple): + assert len(relative_force) == 2 and 0.0 < relative_force[0] < relative_force[1] + self._relative_force_range = (float(relative_force[0]), float(relative_force[1])) + else: + assert relative_force > 0.0 + self._relative_force_range = (float(relative_force), float(relative_force)) + assert gravity > 0.0 + self._gravity = float(gravity) + + # State space + state_indices = [2] + self._state_velocities = bool(state_velocities) + if self._state_velocities: + state_indices.append(3) + state_indices.append(1) + self._state_location = bool(state_location) + if self._state_location: + state_indices.append(0) + self._state_indices = np.array(state_indices, np.int32) + self._state_initials = np.array([[ + 0.0, float(state_initial_max_velocity), + float(state_initial_max_angle), float(state_initial_max_angle_velocity) + ]], dtype=np.float32) + + # Action space + self._action_timedelta = float(action_timedelta) # in seconds + assert not action_continuous or action_noop + self._action_continuous = bool(action_continuous) + 
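        # Note: tuple-valued physics arguments define a (low, high) range from which the corresponding
        # parameter is re-sampled uniformly at every reset(); scalar arguments keep it fixed.
        # execute() then integrates the standard cart-pole dynamics (Florian 2005, cited below), with
        # pole_length interpreted as the pole's half-length.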
self._action_noop = bool(action_noop) + + # State bounds + angle_bound = float(np.pi) / 4.0 + max_angle_acc_in_zero = self._relative_force_range[1] * self._gravity / \ + (self._cart_mass_range[0] + self._pole_mass_range[0]) / \ + self._pole_length_range[0] / \ + (4.0 / 3.0 - self._pole_mass_range[1] / (self._cart_mass_range[0] + self._pole_mass_range[0])) + min_angle_acc_in_zero = self._relative_force_range[0] * self._gravity / \ + (self._cart_mass_range[1] + self._pole_mass_range[1]) / \ + self._pole_length_range[1] / \ + (4.0 / 3.0 - self._pole_mass_range[0] / (self._cart_mass_range[1] + self._pole_mass_range[1])) + max_loc_acc_in_zero = (self._relative_force_range[1] * self._gravity - \ + self._pole_mass_range[0] * self._pole_length_range[0] * min_angle_acc_in_zero) / \ + (self._cart_mass_range[0] + self._pole_mass_range[0]) + angle_vel_bound = max_angle_acc_in_zero * self._action_timedelta * 10.0 + loc_vel_bound = max_loc_acc_in_zero * self._action_timedelta * 10.0 + if self._state_location: + loc_bound = loc_vel_bound + else: + loc_bound = np.inf + self._state_bounds = np.array( + [[loc_bound, loc_vel_bound, angle_bound, angle_vel_bound]], dtype=np.float32 + ) + assert (self._state_bounds > 0.0).all() + + def states(self): + return dict( + type='float', shape=tuple(self._state_indices.shape), + min_value=-self._state_bounds[0, self._state_indices], + max_value=self._state_bounds[0, self._state_indices] + ) + + def actions(self): + if self._action_continuous: + return dict(type='float', shape=()) + elif self._action_noop: + return dict(type='int', shape=(), num_values=3) + else: + return dict(type='int', shape=(), num_values=2) + + def is_vectorizable(self): + return True + + def reset(self, num_parallel=None): + self._is_parallel = (num_parallel is not None) + if self._is_parallel: + self._parallel_indices = np.arange(num_parallel) + else: + self._parallel_indices = np.arange(1) + + # Physics parameters + self._pole_mass = float(np.random.uniform(low=self._pole_mass_range[0], high=self._pole_mass_range[1])) + self._pole_length = float(np.random.uniform(low=self._pole_length_range[0], high=self._pole_length_range[1])) + self._cart_mass = float(np.random.uniform(low=self._cart_mass_range[0], high=self._cart_mass_range[1])) + self._relative_force = float(np.random.uniform(low=self._relative_force_range[0], high=self._relative_force_range[1])) + + # Initialize state + initials = np.tile(self._state_initials, reps=(self._parallel_indices.shape[0], 1)) + self._states = np.random.uniform(low=-initials, high=initials) + + if self._is_parallel: + return self._parallel_indices.copy(), self._states[:, self._state_indices] + else: + return self._states[0, self._state_indices] + + def execute(self, actions): + if not self._is_parallel: + actions = np.expand_dims(actions, axis=0) + + # Split state into components + loc = self._states[:, 0] + loc_vel = self._states[:, 1] + angle = self._states[:, 2] + angle_vel = self._states[:, 3] + + # Make action continuous + if self._action_continuous: + force = actions + else: + force = np.where(actions == 2, 0.0, np.where(actions == 1, 1.0, -1.0)) + force *= self._relative_force * self._gravity + + # Compute accelerations (https://coneural.org/florian/papers/05_cart_pole.pdf) + cos_angle = np.cos(angle) + sin_angle = np.sin(angle) + total_mass = self._cart_mass + self._pole_mass + pole_mass_length = self._pole_mass * self._pole_length + bracket = (force + pole_mass_length * angle_vel * angle_vel * sin_angle) / total_mass + denom = self._pole_length * (4.0 / 
3.0 - (self._pole_mass * cos_angle * cos_angle) / total_mass) + angle_acc = (self._gravity * sin_angle - cos_angle * bracket) / denom + loc_acc = bracket - pole_mass_length * angle_acc * cos_angle / total_mass + + # Integration + deriv = np.stack([loc_vel, loc_acc, angle_vel, angle_acc], axis=1) + self._states += self._action_timedelta * deriv + + # Terminal + terminal = (np.abs(self._states) > self._state_bounds).any(axis=1) + + # Reward + reward = np.ones_like(terminal, dtype=np.float32) + + if self._is_parallel: + self._parallel_indices = self._parallel_indices[~terminal] + self._states = self._states[~terminal] + return self._parallel_indices.copy(), self._states[:, self._state_indices], terminal, reward + else: + return self._states[0, self._state_indices], terminal.item(), reward.item() diff --git a/tensorforce/environments/environment.py b/tensorforce/environments/environment.py index 785f3b557..f84af96eb 100644 --- a/tensorforce/environments/environment.py +++ b/tensorforce/environments/environment.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,90 +13,904 @@ # limitations under the License. # ============================================================================== +import json +import math +import os +import random +import sys +from threading import Thread +import time +from traceback import format_tb -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division +import numpy as np +from tensorforce import TensorforceError, util +from tensorforce.core import TensorSpec, TensorsSpec +from tensorforce.core.utils.dicts import ArrayDict import tensorforce.environments -import tensorforce.util class Environment(object): """ - Base environment class. + Tensorforce environment interface. """ + @staticmethod + def create( + environment=None, max_episode_timesteps=None, reward_shaping=None, + remote=None, blocking=False, host=None, port=None, **kwargs + ): + """ + Creates an environment from a specification. In case of "socket-server" remote mode, runs + environment in server communication loop until closed. + + Args: + environment (specification | Environment class/object): JSON file, specification key, + configuration dictionary, library module, `Environment` class/object, or gym.Env + (required, invalid for "socket-client" + remote mode). + max_episode_timesteps (int > 0): Maximum number of timesteps per episode, overwrites + the environment default if defined + (default: environment default, invalid + for "socket-client" remote mode). + reward_shaping (callable[(s,a,t,r,s') -> r|(r,t)] | str): Reward shaping function + mapping state, action, terminal, reward and next state to shaped reward and + terminal, or a string expression with arguments "states", "actions", "terminal", + "reward" and "next_states", e.g. "-1.0 if terminal else max(reward, 0.0)" + (default: no reward shaping). + remote ("multiprocessing" | "socket-client" | "socket-server"): Communication mode for + remote environment execution of parallelized environment execution, "socket-client" + mode requires a corresponding "socket-server" running, and "socket-server" mode + runs environment in server communication loop until closed + (default: local execution). 
+ blocking (bool): Whether remote environment calls should be blocking + (default: not blocking, invalid unless + "multiprocessing" or "socket-client" remote mode). + host (str): Socket server hostname or IP address + (required only for "socket-client" remote + mode). + port (int): Socket server port + (required only for "socket-client/server" + remote mode). + kwargs: Additional arguments. + """ + if remote not in ('multiprocessing', 'socket-client'): + if blocking: + raise TensorforceError.invalid( + name='Environment.create', argument='blocking', + condition='no multiprocessing/socket-client instance' + ) + if remote not in ('socket-client', 'socket-server'): + if host is not None: + raise TensorforceError.invalid( + name='Environment.create', argument='host', condition='no socket instance' + ) + elif port is not None: + raise TensorforceError.invalid( + name='Environment.create', argument='port', condition='no socket instance' + ) + + if remote == 'multiprocessing': + from tensorforce.environments import MultiprocessingEnvironment + environment = MultiprocessingEnvironment( + blocking=blocking, environment=environment, + max_episode_timesteps=max_episode_timesteps, reward_shaping=reward_shaping, **kwargs + ) + return environment + + elif remote == 'socket-client': + if environment is not None: + raise TensorforceError.invalid( + name='Environment.create', argument='environment', + condition='socket-client instance' + ) + elif max_episode_timesteps is not None: + raise TensorforceError.invalid( + name='Environment.create', argument='max_episode_timesteps', + condition='socket-client instance' + ) + elif len(kwargs) > 0: + raise TensorforceError.invalid( + name='Environment.create', argument='kwargs', + condition='socket-client instance' + ) + from tensorforce.environments import SocketEnvironment + environment = SocketEnvironment(host=host, port=port, blocking=blocking) + return environment + + elif remote == 'socket-server': + from tensorforce.environments import SocketEnvironment + SocketEnvironment.remote( + port=port, environment=environment, max_episode_timesteps=max_episode_timesteps, + reward_shaping=reward_shaping, **kwargs + ) + + elif remote is not None: + raise TensorforceError.value(name='Environment.create', argument='remote', value=remote) + + elif isinstance(environment, (EnvironmentWrapper, RemoteEnvironment)): + if max_episode_timesteps is not None and \ + max_episode_timesteps != environment.max_episode_timesteps(): + raise TensorforceError( + message='Environment argument max_episode_timesteps has been specified twice ' + 'with different values: {} != {}.'.format( + max_episode_timesteps, environment.max_episode_timesteps() + ) + ) + if len(kwargs) > 0: + raise TensorforceError.invalid( + name='Environment.create', argument='kwargs', + condition='EnvironmentWrapper instance' + ) + return environment + + elif isinstance(environment, type) and \ + issubclass(environment, (EnvironmentWrapper, RemoteEnvironment)): + raise TensorforceError.type( + name='Environment.create', argument='environment', dtype=type(environment) + ) + + elif isinstance(environment, Environment): + return EnvironmentWrapper( + environment=environment, max_episode_timesteps=max_episode_timesteps, + reward_shaping=reward_shaping + ) + + elif isinstance(environment, type) and issubclass(environment, Environment): + environment = environment(**kwargs) + assert isinstance(environment, Environment) + return Environment.create( + environment=environment, max_episode_timesteps=max_episode_timesteps, + 
reward_shaping=reward_shaping + ) + + elif isinstance(environment, dict): + # Dictionary specification + util.deep_disjoint_update(target=kwargs, source=environment) + environment = kwargs.pop('environment', kwargs.pop('type', 'default')) + assert environment is not None + if max_episode_timesteps is None: + max_episode_timesteps = kwargs.pop('max_episode_timesteps', None) + if reward_shaping is None: + reward_shaping = kwargs.pop('reward_shaping', None) + + return Environment.create( + environment=environment, max_episode_timesteps=max_episode_timesteps, + reward_shaping=reward_shaping, **kwargs + ) + + elif isinstance(environment, str): + if os.path.isfile(environment): + # JSON file specification + with open(environment, 'r') as fp: + environment = json.load(fp=fp) + + util.deep_disjoint_update(target=kwargs, source=environment) + environment = kwargs.pop('environment', kwargs.pop('type', 'default')) + assert environment is not None + if max_episode_timesteps is None: + max_episode_timesteps = kwargs.pop('max_episode_timesteps', None) + if reward_shaping is None: + reward_shaping = kwargs.pop('reward_shaping', None) + + return Environment.create( + environment=environment, max_episode_timesteps=max_episode_timesteps, + reward_shaping=reward_shaping, **kwargs + ) + + elif environment in tensorforce.environments.environments: + # Keyword specification + environment = tensorforce.environments.environments[environment] + return Environment.create( + environment=environment, max_episode_timesteps=max_episode_timesteps, + reward_shaping=reward_shaping, **kwargs + ) + + else: + # Library specification + import gym + _environment = util.try_import_module( + module=environment, parent_class=(Environment, gym.Env) + ) + if _environment is not None: + return Environment.create( + environment=_environment, max_episode_timesteps=max_episode_timesteps, + reward_shaping=reward_shaping, **kwargs + ) + + # Default: OpenAI Gym + try: + return Environment.create( + environment='gym', level=environment, + max_episode_timesteps=max_episode_timesteps, reward_shaping=reward_shaping, + **kwargs + ) + except TensorforceError: + raise TensorforceError.value( + name='Environment.create', argument='environment', value=environment + ) + + else: + # Default: OpenAI Gym + import gym + if isinstance(environment, gym.Env) or \ + (isinstance(environment, type) and issubclass(environment, gym.Env)): + return Environment.create( + environment='gym', level=environment, + max_episode_timesteps=max_episode_timesteps, reward_shaping=reward_shaping, + **kwargs + ) + + else: + raise TensorforceError.type( + name='Environment.create', argument='environment', dtype=type(environment) + ) + + def __init__(self): + # first two arguments, if applicable: level, visualize=False + util.overwrite_staticmethod(obj=self, function='create') + self._expect_receive = None + self._actions = None + self._num_parallel = None + self._reset_output_check = True + self._execute_output_check = True + def __str__(self): + return self.__class__.__name__ + + def states(self): + """ + Returns the state space specification. + + Returns: + specification: Arbitrarily nested dictionary of state descriptions with the following + attributes: +
                • type ("bool" | "int" | "float") – state data type (default: "float").
                • shape (int | iter[int]) – state shape (required).
                • num_states (int > 0) – number of discrete state values (required for type "int").
                • min_value/max_value (float) – minimum/maximum state value (optional for type "float").
                + """ raise NotImplementedError - def close(self): + def actions(self): """ - Close environment. No other method calls possible afterwards. + Returns the action space specification. + + Returns: + specification: Arbitrarily nested dictionary of action descriptions with the following + attributes: +
                • type ("bool" | "int" | "float") – action data type (required).
                • shape (int > 0 | iter[int > 0]) – action shape (default: scalar).
                • num_actions (int > 0) – number of discrete action values (required for type "int").
                • min_value/max_value (float) – minimum/maximum action value (optional for type "float").
                """ - pass + raise NotImplementedError - def seed(self, seed): + def max_episode_timesteps(self): """ - Sets the random seed of the environment to the given value (current time, if seed=None). - Naturally deterministic Environments (e.g. ALE or some gym Envs) don't have to implement this method. + Returns the maximum number of timesteps per episode. - Args: - seed (int): The seed to use for initializing the pseudo-random number generator (default=epoch time in sec). - Returns: The actual seed (int) used OR None if Environment did not override this method (no seeding supported). + Returns: + int: Maximum number of timesteps per episode. """ return None - def reset(self): + def is_vectorizable(self): """ - Reset environment and setup for new episode. + Returns true if the environment is vectorizable. Returns: - initial state of reset environment. + bool: True if the environment is vectorizable. """ - raise NotImplementedError + return False - def execute(self, actions): + def num_actors(self): + """ + Returns the number of actors in this environment. + + Returns: + int >= 1: The number of actors. """ - Executes action, observes next state(s) and reward. + return 1 + + def close(self): + """ + Closes the environment. + """ + pass + + def episode_return(self, parallel=None): + return None + + def reset(self, num_parallel=None): + """ + Resets the environment to start a new episode. Args: - actions: Actions to execute. + num_parallel (int >= 1): Number of environment instances executed in parallel, only + valid if environment is vectorizable + (no vectorization). Returns: - (Dict of) next state(s), boolean indicating terminal, and reward signal. + (parallel,) dict[state]: Dictionary containing initial state(s) and auxiliary + information, and parallel index vector in case of vectorized execution. """ raise NotImplementedError - @property - def states(self): + def execute(self, actions): """ - Return the state space. Might include subdicts if multiple states are available simultaneously. + Executes the given action(s) and advances the environment by one step. - Returns: dict of state properties (shape and type). + Args: + actions (dict[action]): Dictionary containing action(s) to be executed + (required). + Returns: + (parallel,) dict[state], bool | 0 | 1 | 2, float: Dictionary containing next state(s) + and auxiliary information, whether a terminal state is reached or 2 if the episode was + aborted, observed reward, and parallel index vector in case of vectorized execution. 
""" raise NotImplementedError - @property + def start_reset(self, num_parallel=None): + if self._expect_receive is not None: + raise TensorforceError.unexpected() + self._expect_receive = 'reset' + assert num_parallel is None or self.is_vectorizable() + self._num_parallel = num_parallel + + def start_execute(self, actions): + if self._expect_receive is not None: + raise TensorforceError.unexpected() + self._expect_receive = 'execute' + assert self._actions is None + self._actions = actions + + def receive_execute(self): + if self._expect_receive == 'reset': + self._expect_receive = None + if self._num_parallel is None: + states = self.reset() + else: + parallel, states = self.reset(num_parallel=num_parallel) + if self._reset_output_check: + self._check_states_output(states=states, function='reset') + if self._num_parallel is not None: + TensorSpec(type='int', shape=(), num_values=self._num_parallel).np_assert( + x=parallel, batched=True, + message=(function + ': invalid {issue} for parallel.') + ) + self._reset_output_check = False + if self._num_parallel is None: + return states, -1, None + else: + return parallel, states, -1, None + + elif self._expect_receive == 'execute': + self._expect_receive = None + assert self._actions is not None + if self._num_parallel is None: + states, terminal, reward = self.execute(actions=self._actions) + else: + parallel, states, terminal, reward = self.execute(actions=self._actions) + if self._execute_output_check: + self._check_states_output(states=states, function='execute') + if self._num_parallel is None: + if isinstance(reward, (np.generic, np.ndarray)): + reward = reward.item() + if isinstance(terminal, (np.generic, np.ndarray)): + terminal = terminal.item() + if not isinstance(terminal, bool) and \ + (not isinstance(terminal, int) or terminal < 0 or terminal > 2): + raise TensorforceError( + 'Environment.execute: invalid value {} for terminal.'.format(terminal) + ) + if not isinstance(reward, (float, int)): + raise TensorforceError( + 'Environment.execute: invalid type {} for reward.'.format(type(reward)) + ) + else: + TensorSpec(type='int', shape=(), num_values=self._num_parallel).np_assert( + x=parallel, batched=True, + message=(function + ': invalid {issue} for parallel.') + ) + TensorSpec(type='bool', shape=()).np_assert( + x=terminal, batched=True, + message=(function + ': invalid {issue} for terminal.') + ) + TensorSpec(type='float', shape=()).np_assert( + x=reward, batched=True, + message=(function + ': invalid {issue} for reward.') + ) + self._execute_output_check = False + self._actions = None + if self._num_parallel is None: + return states, int(terminal), reward + else: + return parallel, states, terminal, reward + + else: + raise TensorforceError.unexpected() + + def _check_states_output(self, states, function): + function = 'Environment.' 
+ function + states_spec = self.states() + if 'type' in states_spec or 'shape' in states_spec: + states_spec = TensorSpec(**states_spec) + if isinstance(states, dict): + for name in states: + if name != 'state' and not name.endswith('_mask'): + raise TensorforceError(function + ': invalid component {name} for state.') + _states = states['state'] + else: + _states = states + else: + states_spec = TensorsSpec(self.states()) + _states = ArrayDict() + for name, state in states.items(): + if name in states_spec: + _states[name] = state + elif not name.endswith('_mask'): + raise TensorforceError(function + ': invalid component {name} for state.') + states_spec.np_assert( + x=_states, batched=(self._num_parallel is not None), + message=(function + ': invalid {issue} for {name} state.') + ) + +class EnvironmentWrapper(Environment): + + def __init__(self, environment, max_episode_timesteps=None, reward_shaping=None): + super().__init__() + + if isinstance(environment, EnvironmentWrapper): + raise TensorforceError.unexpected() + if environment.max_episode_timesteps() is not None and \ + max_episode_timesteps is not None and \ + environment.max_episode_timesteps() < max_episode_timesteps: + raise TensorforceError.unexpected() + if environment.is_vectorizable() and environment.num_actors() > 1: + raise TensorforceError.unexpected() + + self._environment = environment + if max_episode_timesteps is None: + self._max_episode_timesteps = self._environment.max_episode_timesteps() + else: + self._max_episode_timesteps = int(max_episode_timesteps) + if self._environment.max_episode_timesteps() is None: + self._environment.max_episode_timesteps = (lambda: int(max_episode_timesteps)) + self._timestep = None + self._previous_states = None + self._reward_shaping = reward_shaping + + def __str__(self): + return str(self._environment) + + def states(self): + return self._environment.states() + def actions(self): - """ - Return the action space. Might include subdicts if multiple actions are available simultaneously. 
+ return self._environment.actions() - Returns: dict of action properties (continuous, number of actions) + def max_episode_timesteps(self): + return self._max_episode_timesteps - """ + def is_vectorizable(self): + return self._environment.is_vectorizable() + + def num_actors(self): + return self._environment.num_actors() + + def close(self): + return self._environment.close() + + def reset(self, num_parallel=None): + self._timestep = 0 + assert num_parallel is None or self.is_vectorizable() + if self.num_actors() > 1: + self._num_parallel = self.num_actors() + else: + self._num_parallel = num_parallel + if self.num_actors() > 1: + parallel, states = self._environment.reset() + self._num_parallel = self.num_actors() + elif num_parallel is None: + states = self._environment.reset() + self._num_parallel = None + else: + parallel, states = self._environment.reset(num_parallel=num_parallel) + self._num_parallel = num_parallel + if self._reset_output_check: + self._check_states_output(states=states, function='reset') + if self._num_parallel is not None: + TensorSpec(type='int', shape=(), num_values=self._num_parallel).np_assert( + x=parallel, batched=True, + message=('Environment.reset: invalid {issue} for parallel.') + ) + self._reset_output_check = False + if self._reward_shaping is not None: + self._previous_states = states + if self._num_parallel is None: + return states + else: + return parallel, states + + def execute(self, actions): + if self._timestep is None: + raise TensorforceError( + message="An environment episode has to be initialized by calling reset() first." + ) + assert self._max_episode_timesteps is None or self._timestep < self._max_episode_timesteps + if self._num_parallel is None: + states, terminal, reward = self._environment.execute(actions=actions) + else: + parallel, states, terminal, reward = self._environment.execute(actions=actions) + if self._execute_output_check: + self._check_states_output(states=states, function='execute') + if self._num_parallel is None: + if isinstance(reward, (np.generic, np.ndarray)): + reward = reward.item() + if isinstance(terminal, (np.generic, np.ndarray)): + terminal = terminal.item() + if not isinstance(terminal, bool) and \ + (not isinstance(terminal, int) or terminal < 0 or terminal > 2): + raise TensorforceError( + 'Environment.execute: invalid value {} for terminal.'.format(terminal) + ) + if not isinstance(reward, (float, int)): + raise TensorforceError( + 'Environment.execute: invalid type {} for reward.'.format(type(reward)) + ) + else: + TensorSpec(type='int', shape=(), num_values=self._num_parallel).np_assert( + x=parallel, batched=True, + message='Environment.execute: invalid {issue} for parallel.' + ) + try: + TensorSpec(type='int', shape=(), num_values=3).np_assert( + x=terminal, batched=True, + message='Environment.execute: invalid {issue} for terminal.' + ) + except TensorforceError: + TensorSpec(type='bool', shape=()).np_assert( + x=terminal, batched=True, + message='Environment.execute: invalid {issue} for terminal.' + ) + TensorSpec(type='float', shape=()).np_assert( + x=reward, batched=True, + message='Environment.execute: invalid {issue} for reward.' 
+ ) + self._execute_output_check = False + if self._reward_shaping is not None: + if isinstance(self._reward_shaping, str): + reward = eval(self._reward_shaping, dict(), dict( + states=self._previous_states, actions=actions, terminal=terminal, reward=reward, + next_states=states, math=math, np=np, random=random + )) + else: + reward = self._reward_shaping( + self._previous_states, actions, terminal, reward, states + ) + if isinstance(reward, tuple): + reward, terminal = reward + if isinstance(reward, (np.generic, np.ndarray)): + reward = reward.item() + if isinstance(terminal, (np.generic, np.ndarray)): + terminal = terminal.item() + self._previous_states = states + self._timestep += 1 + if self._num_parallel is None: + terminal = int(terminal) + if terminal == 0 and self._max_episode_timesteps is not None and \ + self._timestep >= self._max_episode_timesteps: + terminal = 2 + if terminal > 0: + self._timestep = None + return states, terminal, reward + else: + terminal = terminal.astype(util.np_dtype('int')) + if (terminal == 0).any() and self._max_episode_timesteps is not None and \ + self._timestep >= self._max_episode_timesteps: + terminal = np.where(terminal == 0, 2, terminal) + parallel = parallel[:0] + states = None + if (terminal > 0).all(): + self._timestep = None + return parallel, states, terminal, reward + + _ATTRIBUTES = frozenset([ + '_actions', 'create', '_environment', '_execute_output_check', '_expect_receive', + '_previous_states', '_max_episode_timesteps', '_num_parallel', '_reset_output_check', + '_reward_shaping', '_timestep' + ]) + + def __getattr__(self, name): + if name in EnvironmentWrapper._ATTRIBUTES: + return super().__getattr__(name) + else: + return getattr(self._environment, name) + + def __setattr__(self, name, value): + if name in EnvironmentWrapper._ATTRIBUTES: + super().__setattr__(name, value) + else: + return setattr(self._environment, name, value) + + +class RemoteEnvironment(Environment): + + @classmethod + def proxy_send(cls, connection, function, **kwargs): raise NotImplementedError - @staticmethod - def from_spec(spec, kwargs): - """ - Creates an environment from a specification dict. 
- """ - env = tensorforce.util.get_object( - obj=spec, - predefined_objects=tensorforce.environments.environments, - kwargs=kwargs - ) - assert isinstance(env, Environment) - return env + @classmethod + def proxy_receive(cls, connection): + raise NotImplementedError + + @classmethod + def proxy_close(cls, connection): + raise NotImplementedError + + @classmethod + def remote_send(cls, connection, success, result): + raise NotImplementedError + + @classmethod + def remote_receive(cls, connection): + raise NotImplementedError + + @classmethod + def remote_close(cls, connection): + raise NotImplementedError + + @classmethod + def remote( + cls, connection, environment, max_episode_timesteps=None, reward_shaping=None, **kwargs + ): + try: + env = None + env = Environment.create( + environment=environment, max_episode_timesteps=max_episode_timesteps, + reward_shaping=reward_shaping, **kwargs + ) + + while True: + attribute, kwargs = cls.remote_receive(connection=connection) + + if attribute in ('reset', 'execute'): + environment_start = time.time() + + try: + result = getattr(env, attribute) + if callable(result): + if kwargs is None: + result = None + else: + result = result(**kwargs) + elif kwargs is None: + pass + elif len(kwargs) == 1 and 'value' in kwargs: + setattr(env, attribute, kwargs['value']) + result = None + else: + raise TensorforceError(message="Invalid remote attribute/function access.") + except AttributeError: + if kwargs is None or len(kwargs) != 1 or 'value' not in kwargs: + raise TensorforceError(message="Invalid remote attribute/function access.") + setattr(env, attribute, kwargs['value']) + result = None + + if attribute in ('reset', 'execute'): + seconds = time.time() - environment_start + if attribute == 'reset': + result = (result, seconds) + else: + result += (seconds,) + + cls.remote_send(connection=connection, success=True, result=result) + + if attribute == 'close': + break + + except BaseException: + etype, value, traceback = sys.exc_info() + cls.remote_send( + connection=connection, success=False, + result=(str(etype), str(value), format_tb(traceback)) + ) + + try: + if env is not None: + env.close() + except BaseException: + pass + finally: + etype, value, traceback = sys.exc_info() + cls.remote_send( + connection=connection, success=False, + result=(str(etype), str(value), format_tb(traceback)) + ) + + finally: + cls.remote_close(connection=connection) + + def __init__(self, connection, blocking=False): + super().__init__() + self._connection = connection + self._blocking = blocking + self._observation = None + self._thread = None + self._episode_seconds = None + + def send(self, function, kwargs): + if self._expect_receive is not None: + assert function != 'close' + self.close() + raise TensorforceError.unexpected() + self._expect_receive = function + + try: + self.__class__.proxy_send(connection=self._connection, function=function, kwargs=kwargs) + except BaseException: + self.__class__.proxy_close(connection=self._connection) + raise + + def receive(self, function): + if self._expect_receive != function: + assert function != 'close' + self.close() + raise TensorforceError.unexpected() + self._expect_receive = None + + try: + success, result = self.__class__.proxy_receive(connection=self._connection) + except BaseException: + self.__class__.proxy_close(connection=self._connection) + raise + + if success: + return result + else: + self.__class__.proxy_close(connection=self._connection) + etype, value, traceback = result + raise 
TensorforceError(message='\n{}\n{}: {}`'.format(''.join(traceback), etype, value)) + + _ATTRIBUTES = frozenset([ + '_actions', '_blocking', '_connection', 'create', '_episode_seconds', + '_execute_output_check', '_expect_receive', '_num_parallel', '_observation', + '_reset_output_check', '_thread' + ]) + + def __getattr__(self, name): + if name in RemoteEnvironment._ATTRIBUTES: + return super().__getattr__(name) + else: + self.send(function=name, kwargs=None) + result = self.receive(function=name) + if result is None: + def proxy_function(*args, **kwargs): + if len(args) > 0: + raise TensorforceError( + message="Remote environment function call requires keyword arguments." + ) + self.send(function=name, kwargs=kwargs) + return self.receive(function=name) + return proxy_function + else: + return result + + def __setattr__(self, name, value): + if name in RemoteEnvironment._ATTRIBUTES: + super().__setattr__(name, value) + else: + self.send(function=name, kwargs=dict(value=value)) + result = self.receive(function=name) + assert result is None + + def __str__(self): + self.send(function='__str__', kwargs=dict()) + return self.receive(function='__str__') + + def states(self): + self.send(function='states', kwargs=dict()) + return self.receive(function='states') + + def actions(self): + self.send(function='actions', kwargs=dict()) + return self.receive(function='actions') + + def max_episode_timesteps(self): + self.send(function='max_episode_timesteps', kwargs=dict()) + return self.receive(function='max_episode_timesteps') + + def is_vectorizable(self): + self.send(function='is_vectorizable', kwargs=dict()) + return self.receive(function='is_vectorizable') + + def num_actors(self): + self.send(function='num_actors', kwargs=dict()) + return self.receive(function='num_actors') + + def close(self): + if self._thread is not None: + self._thread.join() + if self._expect_receive is not None: + self.receive(function=self._expect_receive) + self.send(function='close', kwargs=dict()) + self.receive(function='close') + self.__class__.proxy_close(connection=self._connection) + self._connection = None + self._observation = None + self._thread = None + + def reset(self): + self._episode_seconds = 0.0 + self.send(function='reset', kwargs=dict()) + states, seconds = self.receive(function='reset') + self._episode_seconds += seconds + return states + + def execute(self, actions): + self.send(function='execute', kwargs=dict(actions=actions)) + states, terminal, reward, seconds = self.receive(function='execute') + self._episode_seconds += seconds + return states, int(terminal), reward + + def start_reset(self): + self._episode_seconds = 0.0 + if self._blocking: + self.send(function='reset', kwargs=dict()) + else: + if self._thread is not None: # TODO: not expected + self._thread.join() + self._observation = None + self._thread = Thread(target=self.finish_reset) + self._thread.start() + + def finish_reset(self): + assert self._thread is not None and self._observation is None + self._observation = (self.reset(), -1, None) + self._thread = None + + def start_execute(self, actions): + if self._blocking: + self.send(function='execute', kwargs=dict(actions=actions)) + else: + assert self._thread is None and self._observation is None + self._thread = Thread(target=self.finish_execute, kwargs=dict(actions=actions)) + self._thread.start() + + def finish_execute(self, actions): + assert self._thread is not None and self._observation is None + self._observation = self.execute(actions=actions) + self._thread = None + + def 
receive_execute(self): + if self._blocking: + if self._expect_receive == 'reset': + states, seconds = self.receive(function='reset') + self._episode_seconds += seconds + return states, -1, None + else: + states, terminal, reward, seconds = self.receive(function='execute') + self._episode_seconds += seconds + return states, int(terminal), reward + else: + if self._thread is not None: + return None + else: + assert self._observation is not None + observation = self._observation + self._observation = None + return observation diff --git a/tensorforce/environments/multiprocessing_environment.py b/tensorforce/environments/multiprocessing_environment.py new file mode 100644 index 000000000..ff96714fa --- /dev/null +++ b/tensorforce/environments/multiprocessing_environment.py @@ -0,0 +1,63 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from multiprocessing import Pipe, Process + +from tensorforce.environments import RemoteEnvironment + + +class MultiprocessingEnvironment(RemoteEnvironment): + """ + An earlier version of this code (#634) was originally developed by Vincent Belus (@vbelus). + """ + + @classmethod + def proxy_send(cls, connection, function, kwargs): + connection[0].send(obj=(function, kwargs)) + + @classmethod + def proxy_receive(cls, connection): + return connection[0].recv() + + @classmethod + def proxy_close(cls, connection): + connection[0].close() + connection[1].join() + + @classmethod + def remote_send(cls, connection, success, result): + connection.send(obj=(success, result)) + + @classmethod + def remote_receive(cls, connection): + return connection.recv() + + @classmethod + def remote_close(cls, connection): + connection.close() + + def __init__( + self, environment, blocking=False, max_episode_timesteps=None, reward_shaping=None, + **kwargs + ): + proxy_connection, remote_connection = Pipe(duplex=True) + process = Process( + target=self.__class__.remote, kwargs=dict( + connection=remote_connection, environment=environment, + max_episode_timesteps=max_episode_timesteps, reward_shaping=reward_shaping, **kwargs + ) + ) + process.start() + super().__init__(connection=(proxy_connection, process), blocking=blocking) diff --git a/tensorforce/environments/open_sim.py b/tensorforce/environments/open_sim.py new file mode 100644 index 000000000..f9975663b --- /dev/null +++ b/tensorforce/environments/open_sim.py @@ -0,0 +1,66 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
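# A minimal, self-contained sketch of the request/reply pattern that RemoteEnvironment and
# MultiprocessingEnvironment above implement: the proxy side sends (function, kwargs) tuples
# over a duplex Pipe, the remote worker answers with (success, result) tuples, and a final
# 'close' request ends the loop. All names here are hypothetical stand-ins, not Tensorforce API.

from multiprocessing import Pipe, Process


def remote_worker(connection):
    # Remote side: answer (function, kwargs) requests until 'close' arrives.
    while True:
        function, kwargs = connection.recv()
        if function == 'close':
            connection.send((True, None))
            break
        elif function == 'echo':
            connection.send((True, kwargs))
        else:
            connection.send((False, 'unknown function: {}'.format(function)))
    connection.close()


if __name__ == '__main__':
    proxy_connection, remote_connection = Pipe(duplex=True)
    process = Process(target=remote_worker, args=(remote_connection,))
    process.start()
    proxy_connection.send(('echo', dict(value=42)))  # corresponds to proxy_send
    success, result = proxy_connection.recv()        # corresponds to proxy_receive
    assert success and result == dict(value=42)
    proxy_connection.send(('close', None))
    proxy_connection.recv()
    proxy_connection.close()                         # corresponds to proxy_close
    process.join()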
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from tensorforce.environments import Environment
+
+
+class OpenSim(Environment):
+    """
+    [OpenSim](http://osim-rl.stanford.edu/) environment adapter (specification key: `osim`,
+    `open_sim`).
+
+    Args:
+        level ('Arm2D' | 'L2M2019' | 'LegacyArm' | 'LegacyRun'): Environment id
+            (required).
+        visualize (bool): Whether to visualize interaction
+            (default: false).
+        integrator_accuracy (float): Integrator accuracy
+            (default: 5e-5).
+    """
+
+    @classmethod
+    def levels(cls):
+        return ['Arm2D', 'L2M2019', 'LegacyArm', 'LegacyRun']
+
+    def __init__(self, level, visualize=False, **kwargs):
+        super().__init__()
+
+        from osim.env import Arm2DEnv, L2M2019Env
+        from osim.env.legacy.arm import ArmEnv
+        from osim.env.legacy.run import RunEnv
+
+        environments = dict(Arm2D=Arm2DEnv, L2M2019=L2M2019Env, LegacyArm=ArmEnv, LegacyRun=RunEnv)
+
+        self.environment = environments[level](visualize=visualize, **kwargs)
+
+    def __str__(self):
+        return super().__str__() + '({})'.format(self.environment)
+
+    def states(self):
+        return dict(type='float', shape=self.environment.get_observation_space_size())
+
+    def actions(self):
+        return dict(type='float', shape=self.environment.get_action_space_size())
+
+    def close(self):
+        self.environment.close()
+        self.environment = None
+
+    def reset(self):
+        return self.environment.reset()
+
+    def execute(self, actions):
+        states, reward, terminal, _ = self.environment.step(action=actions)
+        return states, terminal, reward
diff --git a/tensorforce/environments/openai_gym.py b/tensorforce/environments/openai_gym.py
new file mode 100644
index 000000000..898e7d3ce
--- /dev/null
+++ b/tensorforce/environments/openai_gym.py
@@ -0,0 +1,498 @@
+# Copyright 2020 Tensorforce Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import importlib
+
+import numpy as np
+
+from tensorforce import TensorforceError, util
+from tensorforce.environments import Environment
+
+
+class OpenAIGym(Environment):
+    """
+    [OpenAI Gym](https://gym.openai.com/) environment adapter (specification key: `gym`,
+    `openai_gym`).
+
+    May require:
+    ```bash
+    pip3 install gym
+    pip3 install gym[all]
+    ```
+
+    Args:
+        level (string | gym.Env): Gym id or instance
+            (required).
+        visualize (bool): Whether to visualize interaction
+            (default: false).
+        min_value (float): Lower bound clipping for otherwise unbounded state values
+            (default: no clipping).
+        max_value (float): Upper bound clipping for otherwise unbounded state values
+            (default: no clipping).
+        terminal_reward (float): Additional reward for early termination, if otherwise
+            indistinguishable from termination due to maximum number of timesteps
+            (default: Gym default).
+ reward_threshold (float): Gym environment argument, the reward threshold before the task is + considered solved + (default: Gym default). + drop_states_indices (list[int]): Drop states indices + (default: none). + visualize_directory (string): Visualization output directory + (default: none). + kwargs: Additional Gym environment arguments. + """ + + @classmethod + def levels(cls): + import gym + + return list(gym.envs.registry.env_specs) + + @classmethod + def create_level(cls, level, max_episode_steps, reward_threshold, **kwargs): + import gym + + requires_register = False + + # Find level + if level not in gym.envs.registry.env_specs: + if max_episode_steps is None: # interpret as false if level does not exist + max_episode_steps = False + env_specs = list(gym.envs.registry.env_specs) + if level + '-v0' in gym.envs.registry.env_specs: + env_specs.insert(0, level + '-v0') + search = level + level = None + for name in env_specs: + if search == name[:name.rindex('-v')]: + if level is None: + level = name + if max_episode_steps is False and \ + gym.envs.registry.env_specs[name].max_episode_steps is not None: + continue + elif max_episode_steps != gym.envs.registry.env_specs[name].max_episode_steps: + continue + level = name + break + else: + if level is None: + raise TensorforceError.value(name='OpenAIGym', argument='level', value=level) + assert level in cls.levels() + + # Check/update attributes + if max_episode_steps is None: + max_episode_steps = gym.envs.registry.env_specs[level].max_episode_steps + if max_episode_steps is None: + max_episode_steps = False + elif max_episode_steps != gym.envs.registry.env_specs[level].max_episode_steps: + if not ( + (max_episode_steps is False) and + (gym.envs.registry.env_specs[level].max_episode_steps is None) + ): + requires_register = True + if reward_threshold is None: + reward_threshold = gym.envs.registry.env_specs[level].reward_threshold + elif reward_threshold != gym.envs.registry.env_specs[level].reward_threshold: + requires_register = True + + if max_episode_steps is False: + max_episode_steps = None + + # Modified specification + if requires_register: + entry_point = gym.envs.registry.env_specs[level].entry_point + _kwargs = dict(gym.envs.registry.env_specs[level].kwargs) + nondeterministic = gym.envs.registry.env_specs[level].nondeterministic + + if '-v' in level and level[level.rindex('-v') + 2:].isdigit(): + version = int(level[level.rindex('-v') + 2:]) + level = level[:level.rindex('-v') + 2] + else: + version = -1 + while True: + version += 1 + if level + str(version) not in gym.envs.registry.env_specs: + level = level + str(version) + break + + gym.register( + id=level, entry_point=entry_point, reward_threshold=reward_threshold, + nondeterministic=nondeterministic, max_episode_steps=max_episode_steps, + kwargs=_kwargs + ) + assert level in cls.levels() + + return gym.make(id=level, **kwargs), max_episode_steps + + def __init__( + self, level, visualize=False, import_modules=None, min_value=None, max_value=None, + terminal_reward=0.0, reward_threshold=None, drop_states_indices=None, + visualize_directory=None, **kwargs + ): + super().__init__() + + import gym + import gym.wrappers + + if import_modules is None: + pass + elif isinstance(import_modules, str): + importlib.import_module(name=import_modules) + elif isinstance(import_modules, (list, tuple)): + for module in import_modules: + importlib.import_module(name=module) + + self.level = level + self.visualize = visualize + self.terminal_reward = terminal_reward + + if 
isinstance(level, gym.Env): + self.environment = self.level + self.level = self.level.__class__.__name__ + self._max_episode_timesteps = None + elif isinstance(level, type) and issubclass(level, gym.Env): + self.environment = self.level(**kwargs) + self.level = self.level.__class__.__name__ + self._max_episode_timesteps = None + else: + self.environment, self._max_episode_timesteps = self.__class__.create_level( + level=self.level, max_episode_steps=None, reward_threshold=reward_threshold, + **kwargs + ) + + if visualize_directory is not None: + self.environment = gym.wrappers.Monitor( + env=self.environment, directory=visualize_directory + ) + + self.min_value = min_value + self.max_value = max_value + if min_value is not None: + if max_value is None: + raise TensorforceError.required(name='OpenAIGym', argument='max_value') + self.states_spec = OpenAIGym.specs_from_gym_space( + space=self.environment.observation_space, min_value=min_value, max_value=max_value + ) + elif max_value is not None: + raise TensorforceError.required(name='OpenAIGym', argument='min_value') + else: + self.states_spec = OpenAIGym.specs_from_gym_space( + space=self.environment.observation_space, allow_infinite_box_bounds=True + ) + + if drop_states_indices is None: + self.drop_states_indices = None + else: + assert 'shape' in self.states_spec + self.drop_states_indices = sorted(drop_states_indices) + assert len(self.states_spec['shape']) == 1 + num_dropped = len(self.drop_states_indices) + self.states_spec['shape'] = (self.states_spec['shape'][0] - num_dropped,) + + self.actions_spec = OpenAIGym.specs_from_gym_space(space=self.environment.action_space) + + def __str__(self): + return super().__str__() + '({})'.format(self.level) + + def states(self): + return self.states_spec + + def actions(self): + return self.actions_spec + + def max_episode_timesteps(self): + return self._max_episode_timesteps + + def close(self): + self.environment.close() + self.environment = None + + def reset(self): + import gym.wrappers + + if isinstance(self.environment, gym.wrappers.Monitor): + self.environment.stats_recorder.done = True + states = self.environment.reset() + self.timestep = 0 + states = OpenAIGym.flatten_state( + state=states, states_spec=self.states_spec, actions_spec=self.actions_spec + ) + if self.min_value is not None: + states = np.clip(states, self.states_spec['min_value'], self.states_spec['max_value']) + if self.drop_states_indices is not None: + for index in reversed(self.drop_states_indices): + states = np.concatenate([states[:index], states[index + 1:]]) + + return states + + def execute(self, actions): + if self.visualize: + self.environment.render() + actions = OpenAIGym.unflatten_action(action=actions) + states, reward, terminal, _ = self.environment.step(actions) + + self.timestep += 1 + if self._max_episode_timesteps is not None and self.timestep == self._max_episode_timesteps: + assert terminal + terminal = 2 + elif terminal: + assert self._max_episode_timesteps is None or \ + self.timestep < self._max_episode_timesteps + reward += self.terminal_reward + terminal = 1 + else: + terminal = 0 + states = OpenAIGym.flatten_state( + state=states, states_spec=self.states_spec, actions_spec=self.actions_spec + ) + if self.min_value is not None: + states = np.clip(states, self.states_spec['min_value'], self.states_spec['max_value']) + if self.drop_states_indices is not None: + for index in reversed(self.drop_states_indices): + states = np.concatenate([states[:index], states[index + 1:]]) + return states, terminal, 
reward + + @staticmethod + def specs_from_gym_space( + space, allow_infinite_box_bounds=False, min_value=None, max_value=None + ): + import gym + + if isinstance(space, gym.spaces.Discrete): + return dict(type='int', shape=(), num_values=space.n) + + elif isinstance(space, gym.spaces.MultiBinary): + return dict(type='bool', shape=space.n) + + elif isinstance(space, gym.spaces.MultiDiscrete): + if (space.nvec == space.nvec.item(0)).all(): + return dict(type='int', shape=space.nvec.shape, num_values=space.nvec.item(0)) + else: + specs = dict() + nvec = space.nvec.flatten() + shape = '_'.join(str(x) for x in space.nvec.shape) + for n in range(nvec.shape[0]): + specs['gymmdc{}_{}'.format(n, shape)] = dict( + type='int', shape=(), num_values=nvec[n] + ) + return specs + + elif isinstance(space, gym.spaces.Box): + spec = dict(type='float', shape=space.shape) + + if (space.low == space.low.item(0)).all(): + _min_value = float(space.low.item(0)) + if _min_value > -1e6: + spec['min_value'] = _min_value + else: + spec['min_value'] = min_value + elif allow_infinite_box_bounds: + _min_value = np.where(space.low <= -1e6, -np.inf, space.low) + spec['min_value'] = _min_value.astype(util.np_dtype(dtype='float')) + elif (space.low > -1e6).all(): + spec['min_value'] = space.low.astype(util.np_dtype(dtype='float')) + elif min_value is None: + raise TensorforceError("Invalid infinite box bounds") + else: + _min_value = np.where(space.low <= -1e6, min_value, space.low) + spec['min_value'] = _min_value.astype(util.np_dtype(dtype='float')) + + if spec is None: + pass + elif (space.high == space.high.item(0)).all(): + _max_value = float(space.high.item(0)) + if _max_value < 1e6: + spec['max_value'] = _max_value + else: + spec['max_value'] = max_value + elif allow_infinite_box_bounds: + _max_value = np.where(space.high >= 1e6, np.inf, space.high) + spec['max_value'] = _max_value.astype(util.np_dtype(dtype='float')) + elif (space.high < 1e6).all(): + spec['max_value'] = space.high.astype(util.np_dtype(dtype='float')) + elif max_value is None: + raise TensorforceError("OpenAIGym: Invalid infinite box bounds") + else: + _max_value = np.where(space.high >= 1e6, max_value, space.high) + spec['max_value'] = _max_value.astype(util.np_dtype(dtype='float')) + + if spec is None: + specs = dict() + low = space.low.flatten() + high = space.high.flatten() + shape = '_'.join(str(x) for x in space.low.shape) + for n in range(low.shape[0]): + spec = dict(type='float', shape=()) + if low[n] > -1e6: + spec['min_value'] = float(low[n]) + if high[n] < 1e6: + spec['max_value'] = float(high[n]) + specs['gymbox{}_{}'.format(n, shape)] = spec + return specs + else: + return spec + + elif isinstance(space, gym.spaces.Tuple): + specs = dict() + n = 0 + for n, space in enumerate(space.spaces): + spec = OpenAIGym.specs_from_gym_space( + space=space, allow_infinite_box_bounds=allow_infinite_box_bounds + ) + if 'type' in spec: + specs['gymtpl{}'.format(n)] = spec + else: + for name, spec in spec.items(): + specs['gymtpl{}_{}'.format(n, name)] = spec + return specs + + elif isinstance(space, gym.spaces.Dict): + specs = dict() + for space_name, space in space.spaces.items(): + spec = OpenAIGym.specs_from_gym_space( + space=space, allow_infinite_box_bounds=allow_infinite_box_bounds + ) + if 'type' in spec: + specs[space_name] = spec + else: + for name, spec in spec.items(): + specs['{}_{}'.format(space_name, name)] = spec + return specs + + else: + raise TensorforceError('Unknown Gym space.') + + @staticmethod + def flatten_state(state, 
states_spec, actions_spec=None): + if isinstance(state, tuple): + states = dict() + for n, state in enumerate(state): + if 'gymtpl{}'.format(n) in states_spec: + spec = states_spec['gymtpl{}'.format(n)] + else: + spec = None + for name in states_spec: + if name.startswith('gymtpl{}_'.format(n)): + assert spec is None + spec = states_spec[name] + assert spec is not None + state = OpenAIGym.flatten_state(state=state, states_spec=spec) + if isinstance(state, dict): + for name, state in state.items(): + states['gymtpl{}_{}'.format(n, name)] = state + else: + states['gymtpl{}'.format(n)] = state + return states + + elif isinstance(state, dict): + states = dict() + if actions_spec is not None: + if 'type' in actions_spec: + if 'action_mask' in state: + states['action_mask'] = state.pop('action_mask') + else: + for action_name in actions_spec: + action_name = '{}_mask'.format(action_name) + if action_name in state: + states[action_name] = state.pop(action_name) + for state_name, state in state.items(): + if state_name in states_spec: + spec = states_spec[state_name] + else: + spec = None + for name in states_spec: + if name.startswith('{}_'.format(state_name)): + assert spec is None + spec = states_spec[name] + assert spec is not None + state = OpenAIGym.flatten_state(state=state, states_spec=spec) + if isinstance(state, dict): + for name, state in state.items(): + states['{}_{}'.format(state_name, name)] = state + else: + states[state_name] = state + return states + + elif np.isinf(state).any() or np.isnan(state).any(): + raise TensorforceError("State contains inf or nan.") + + elif 'gymbox0' in states_spec: + states = dict() + state = state.flatten() + shape = '_'.join(str(x) for x in state.shape) + for n in range(state.shape[0]): + states['gymbox{}_{}'.format(n, shape)] = state[n] + return states + + elif 'gymmdc0' in states_spec: + states = dict() + state = state.flatten() + shape = '_'.join(str(x) for x in state.shape) + for n in range(state.shape[0]): + states['gymmdc{}_{}'.format(n, shape)] = state[n] + return states + + else: + return state + + @staticmethod + def unflatten_action(action): + if not isinstance(action, dict): + if np.isinf(action).any() or np.isnan(action).any(): + raise TensorforceError("Action contains inf or nan.") + + return action + + elif all(name.startswith('gymmdc') for name in action) or \ + all(name.startswith('gymbox') for name in action) or \ + all(name.startswith('gymtpl') for name in action): + space_type = next(iter(action))[:6] + actions = list() + n = 0 + while True: + if any(name.startswith(space_type + str(n) + '_') for name in action): + inner_action = [ + value for name, value in action.items() + if name.startswith(space_type + str(n)) + ] + assert len(inner_action) == 1 + actions.append(OpenAIGym.unflatten_action(action=inner_action[0])) + elif any(name == space_type + str(n) for name in action): + actions.append(OpenAIGym.unflatten_action(action=action[space_type + str(n)])) + else: + break + n += 1 + if all(name.startswith('gymmdc') for name in action) or \ + all(name.startswith('gymbox') for name in action): + name = next(iter(action)) + shape = tuple(int(x) for x in name[name.index('_') + 1:].split('_')) + return np.array(actions).reshape(shape) + else: + return tuple(actions) + + else: + actions = dict() + for name, action in action.items(): + if '_' in name: + name, inner_name = name.split('_', 1) + if name not in actions: + actions[name] = dict() + actions[name][inner_name] = action + else: + actions[name] = action + for name, action in 
actions.items(): + if isinstance(action, dict): + actions[name] = OpenAIGym.unflatten_action(action=action) + return actions diff --git a/tensorforce/environments/openai_retro.py b/tensorforce/environments/openai_retro.py new file mode 100644 index 000000000..72615fd1c --- /dev/null +++ b/tensorforce/environments/openai_retro.py @@ -0,0 +1,58 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from tensorforce.environments import OpenAIGym + + +class OpenAIRetro(OpenAIGym): + """ + [OpenAI Retro](https://github.com/openai/retro) environment adapter (specification key: + `retro`, `openai_retro`). + + May require: + ```bash + pip3 install gym-retro + ``` + + Args: + level (string): Game id + (required). + visualize (bool): Whether to visualize interaction + (default: false). + monitor_directory (string): Monitor output directory + (default: none). + kwargs: Additional Retro environment arguments. + """ + + @classmethod + def levels(cls): + import retro + + return list(retro.data.list_games()) + + @classmethod + def create_level(cls, level, max_episode_steps, reward_threshold, **kwargs): + import retro + + assert max_episode_steps is None and reward_threshold is None + + return retro.make(game=level, **kwargs), None + + def __init__(self, level, visualize=False, visualize_directory=None, **kwargs): + import retro + + super().__init__( + level=level, visualize=visualize, visualize_directory=visualize_directory, **kwargs + ) diff --git a/tensorforce/environments/pygame_learning_environment.py b/tensorforce/environments/pygame_learning_environment.py new file mode 100644 index 000000000..461e4d58c --- /dev/null +++ b/tensorforce/environments/pygame_learning_environment.py @@ -0,0 +1,129 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict +import os + +import numpy as np + +from tensorforce import TensorforceError +from tensorforce.environments import Environment + + +class PyGameLearningEnvironment(Environment): + """ + [PyGame Learning Environment](https://github.com/ntasfi/PyGame-Learning-Environment/) + environment adapter (specification key: `ple`, `pygame_learning_environment`). 
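# Illustrative sketch of the naming scheme used by specs_from_gym_space, flatten_state and
# unflatten_action above, assuming gym is installed and OpenAIGym is importable from
# tensorforce.environments (as in openai_retro.py): tuple sub-spaces become 'gymtpl<n>' specs,
# and unflatten_action reassembles the corresponding flat action dict into a tuple.

import gym

from tensorforce.environments import OpenAIGym

space = gym.spaces.Tuple((gym.spaces.Discrete(3), gym.spaces.Discrete(5)))
specs = OpenAIGym.specs_from_gym_space(space=space)
print(specs)  # {'gymtpl0': {'type': 'int', 'shape': (), 'num_values': 3}, 'gymtpl1': {...}}

action = OpenAIGym.unflatten_action(action={'gymtpl0': 1, 'gymtpl1': 4})
print(action)  # (1, 4)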
+
+    May require:
+    ```bash
+    sudo apt-get install git python3-dev python3-setuptools python3-numpy python3-opengl \
+    libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev libsmpeg-dev libsdl1.2-dev \
+    libportmidi-dev libswscale-dev libavformat-dev libavcodec-dev libtiff5-dev libx11-6 \
+    libx11-dev fluid-soundfont-gm timgm6mb-soundfont xfonts-base xfonts-100dpi xfonts-75dpi \
+    xfonts-cyrillic fontconfig fonts-freefont-ttf libfreetype6-dev
+
+    pip3 install pygame
+    pip3 install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
+    ```
+
+    Args:
+        level (string | subclass of `ple.games.base`): Game instance or name of class in
+            `ple.games`, like "Catcher", "Doom", "FlappyBird", "MonsterKong", "Pixelcopter",
+            "Pong", "PuckWorld", "RaycastMaze", "Snake", "WaterWorld"
+            (required).
+        visualize (bool): Whether to visualize interaction
+            (default: false).
+        frame_skip (int > 0): Number of times to repeat an action without observing
+            (default: 1).
+        fps (int > 0): Desired frames per second at which the game runs
+            (default: 30).
+    """
+
+    @classmethod
+    def levels(cls):
+        import ple
+
+        levels = list()
+        for level in dir(ple.games):
+            level_cls = getattr(ple.games, level)
+            if isinstance(level_cls, type) and issubclass(level_cls, ple.games.base.PyGameWrapper):
+                levels.append(level)
+        return levels
+
+    def __init__(self, level, visualize=False, frame_skip=1, fps=30):
+        super().__init__()
+
+        import ple
+
+        if isinstance(level, str):
+            assert level in PyGameLearningEnvironment.levels()
+            level = getattr(ple.games, level)()
+
+        if not visualize:
+            os.putenv('SDL_VIDEODRIVER', 'fbcon')
+            os.environ['SDL_VIDEODRIVER'] = 'dummy'
+
+        self.environment = ple.PLE(
+            game=level, fps=fps, frame_skip=frame_skip, display_screen=visualize
+            # num_steps=1, reward_values={}, force_fps=True, add_noop_action=True, NOOP=K_F15,
+            # state_preprocessor=None, rng=24
+        )
+        self.environment.init()
+
+        self.has_game_state = self.environment.getGameStateDims() is not None
+        self.available_actions = tuple(self.environment.getActionSet())
+
+    def __str__(self):
+        return super().__str__() + '({})'.format(self.environment.__class__.__name__)
+
+    def states(self):
+        if self.has_game_state:
+            return OrderedDict(
+                screen=dict(
+                    type='float', shape=(tuple(self.environment.getScreenDims()) + (3,)),
+                    min_value=0.0, max_value=1.0
+                ), state=dict(type='float', shape=tuple(self.environment.getGameStateDims()))
+            )
+        else:
+            return dict(
+                type='float', shape=(tuple(self.environment.getScreenDims()) + (3,)),
+                min_value=0.0, max_value=1.0
+            )
+
+    def actions(self):
+        return dict(type='int', shape=(), num_values=len(self.available_actions))
+
+    def close(self):
+        self.environment = None
+
+    def get_states(self):
+        screen = self.environment.getScreenRGB().astype(dtype=np.float32) / 255.0
+        if self.has_game_state:
+            return OrderedDict(screen=screen, state=self.environment.getGameState())
+        else:
+            return screen
+
+    def reset(self):
+        self.environment.reset_game()
+        return self.get_states()
+
+    def execute(self, actions):
+        if self.environment.game_over():
+            raise TensorforceError.unexpected()
+        reward = self.environment.act(action=self.available_actions[actions])
+        terminal = self.environment.game_over()
+        states = self.get_states()
+        return states, terminal, reward
diff --git a/tensorforce/environments/socket_environment.py b/tensorforce/environments/socket_environment.py
new file mode 100644
index 000000000..ebc7af531
--- /dev/null
+++ b/tensorforce/environments/socket_environment.py
@@ -0,0 +1,156 @@
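# Illustrative usage sketch for the PyGameLearningEnvironment adapter above, assuming pygame
# and PyGame-Learning-Environment are installed and the adapter is exported from
# tensorforce.environments like the other adapters; 'Catcher' is one of the game names listed
# in the docstring. execute() returns the usual (states, terminal, reward) triple.

import random

from tensorforce.environments import PyGameLearningEnvironment

environment = PyGameLearningEnvironment(level='Catcher', visualize=False, frame_skip=1, fps=30)
num_actions = environment.actions()['num_values']

states = environment.reset()
terminal = False
while not terminal:
    states, terminal, reward = environment.execute(actions=random.randrange(num_actions))
environment.close()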
+# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from socket import SHUT_RDWR, socket as Socket +import time + +import msgpack +import msgpack_numpy + +from tensorforce import TensorforceError +from tensorforce.environments import RemoteEnvironment + + +msgpack_numpy.patch() + + +class SocketEnvironment(RemoteEnvironment): + """ + An earlier version of this code (#626) was originally developed as part of the following work: + + Rabault, J., Kuhnle, A (2019). Accelerating Deep Reinforcement Leaning strategies of Flow + Control through a multi-environment approach. Physics of Fluids. + """ + + MAX_BYTES = 4096 + + @classmethod + def remote(cls, port, environment, max_episode_timesteps=None, reward_shaping=None, **kwargs): + socket = Socket() + socket.bind(('', port)) + socket.listen(1) + connection, address = socket.accept() + socket.close() + super().remote( + connection=connection, environment=environment, + max_episode_timesteps=max_episode_timesteps, reward_shaping=reward_shaping, **kwargs + ) + + @classmethod + def proxy_send(cls, connection, function, kwargs): + str_function = function.encode() + num_bytes = len(str_function) + str_num_bytes = '{:08d}'.format(num_bytes).encode() + bytes_sent = connection.send(str_num_bytes + str_function) + if bytes_sent != num_bytes + 8: + raise TensorforceError.unexpected() + + str_kwargs = msgpack.packb(o=kwargs) + num_bytes = len(str_kwargs) + str_num_bytes = '{:08d}'.format(num_bytes).encode() + bytes_sent = connection.send(str_num_bytes + str_kwargs) + if bytes_sent != num_bytes + 8: + raise TensorforceError.unexpected() + + @classmethod + def proxy_receive(cls, connection): + str_success = connection.recv(1) + if len(str_success) != 1 or (str_success != b'0' and str_success != b'1'): + raise TensorforceError.unexpected() + success = (str_success == b'1') + + str_num_bytes = connection.recv(8) + if len(str_num_bytes) != 8: + raise TensorforceError.unexpected() + num_bytes = int(str_num_bytes.decode()) + str_result = b'' + for n in range(num_bytes // cls.MAX_BYTES): + str_result += connection.recv(cls.MAX_BYTES) + if len(str_result) != n * cls.MAX_BYTES: + raise TensorforceError.unexpected() + str_result += connection.recv(num_bytes % cls.MAX_BYTES) + if len(str_result) != num_bytes: + raise TensorforceError.unexpected() + result = msgpack.unpackb(packed=str_result) + + return success, result + + @classmethod + def proxy_close(cls, connection): + connection.shutdown(SHUT_RDWR) + connection.close() + + @classmethod + def remote_send(cls, connection, success, result): + str_success = b'1' if success else b'0' + bytes_sent = connection.send(str_success) + if bytes_sent != 1: + raise TensorforceError.unexpected() + + str_result = msgpack.packb(o=result) + num_bytes = len(str_result) + str_num_bytes = '{:08d}'.format(num_bytes).encode() + bytes_sent = connection.send(str_num_bytes + str_result) + if bytes_sent != 
num_bytes + 8: + raise TensorforceError.unexpected() + + @classmethod + def remote_receive(cls, connection): + str_num_bytes = connection.recv(8) + if len(str_num_bytes) != 8: + raise TensorforceError.unexpected() + num_bytes = int(str_num_bytes.decode()) + str_function = b'' + for n in range(num_bytes // cls.MAX_BYTES): + str_function += connection.recv(cls.MAX_BYTES) + if len(str_function) != n * cls.MAX_BYTES: + raise TensorforceError.unexpected() + str_function += connection.recv(num_bytes % cls.MAX_BYTES) + if len(str_function) != num_bytes: + raise TensorforceError.unexpected() + function = str_function.decode() + + str_num_bytes = connection.recv(8) + if len(str_num_bytes) != 8: + raise TensorforceError.unexpected() + num_bytes = int(str_num_bytes.decode()) + str_kwargs = b'' + for n in range(num_bytes // cls.MAX_BYTES): + str_kwargs += connection.recv(cls.MAX_BYTES) + if len(str_kwargs) != n * cls.MAX_BYTES: + raise TensorforceError.unexpected() + str_kwargs += connection.recv(num_bytes % cls.MAX_BYTES) + if len(str_kwargs) != num_bytes: + raise TensorforceError.unexpected() + kwargs = msgpack.unpackb(packed=str_kwargs) + + return function, kwargs + + @classmethod + def remote_close(cls, connection): + connection.shutdown(SHUT_RDWR) + connection.close() + + def __init__(self, host, port, blocking=False): + socket = Socket() + for _ in range(100): # TODO: 10sec timeout, not configurable + try: + socket.connect((host, port)) + break + except ConnectionRefusedError: + time.sleep(0.1) + else: + raise TensorforceError("Remote socket connection could not be established.") + super().__init__(connection=socket, blocking=blocking) diff --git a/tensorforce/environments/vizdoom.py b/tensorforce/environments/vizdoom.py new file mode 100644 index 000000000..d80fce31a --- /dev/null +++ b/tensorforce/environments/vizdoom.py @@ -0,0 +1,145 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict +import itertools + +import numpy as np + +from tensorforce.environments import Environment + + +class ViZDoom(Environment): + """ + [ViZDoom](https://github.com/mwydmuch/ViZDoom) environment adapter (specification key: + `vizdoom`). + + May require: + ```bash + sudo apt-get install g++ build-essential libsdl2-dev zlib1g-dev libmpg123-dev libjpeg-dev \ + libsndfile1-dev nasm tar libbz2-dev libgtk2.0-dev make cmake git chrpath timidity \ + libfluidsynth-dev libgme-dev libopenal-dev timidity libwildmidi-dev unzip libboost-all-dev \ + liblua5.1-dev + + pip3 install vizdoom + ``` + + Args: + level (string): ViZDoom configuration file + (required). + include_variables (bool): Whether to include game variables to state + (default: false). + factored_action (bool): Whether to use factored action representation + (default: false). + visualize (bool): Whether to visualize interaction + (default: false). 
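# A minimal sketch of the wire framing used by SocketEnvironment above (helper names are
# hypothetical): every payload is prefixed by its byte length as an 8-character zero-padded
# ASCII decimal, payloads are msgpack-encoded, and msgpack_numpy is patched in so NumPy
# arrays survive the round trip. A socketpair stands in for the client/server sockets.

import socket

import msgpack
import msgpack_numpy

msgpack_numpy.patch()


def send_frame(connection, payload):
    # Mirrors proxy_send/remote_send: 8-digit ASCII byte count, then the payload itself.
    connection.sendall('{:08d}'.format(len(payload)).encode() + payload)


def recv_exactly(connection, num_bytes):
    # Loop until the full frame has arrived, since socket.recv may return fewer bytes.
    data = b''
    while len(data) < num_bytes:
        chunk = connection.recv(num_bytes - len(data))
        if not chunk:
            raise ConnectionError('Socket closed before the frame was complete.')
        data += chunk
    return data


def recv_frame(connection):
    num_bytes = int(recv_exactly(connection, 8).decode())
    return recv_exactly(connection, num_bytes)


if __name__ == '__main__':
    proxy, remote = socket.socketpair()
    send_frame(proxy, 'execute'.encode())
    send_frame(proxy, msgpack.packb(o=dict(actions=1)))
    assert recv_frame(remote).decode() == 'execute'
    print(msgpack.unpackb(packed=recv_frame(remote)))  # {'actions': 1}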
+ frame_skip (int > 0): Number of times to repeat an action without observing + (default: 12). + seed (int): Random seed + (default: none). + """ + + def __init__( + self, level, visualize=False, include_variables=False, factored_action=False, + frame_skip=12, seed=None + ): + super().__init__() + + from vizdoom import DoomGame, Mode, ScreenFormat, ScreenResolution + + self.config_file = level + self.include_variables = include_variables + self.factored_action = factored_action + self.visualize = visualize + self.frame_skip = frame_skip + + self.environment = DoomGame() + self.environment.load_config(self.config_file) + if self.visualize: + self.environment.set_window_visible(True) + self.environment.set_mode(Mode.ASYNC_PLAYER) + else: + self.environment.set_window_visible(False) + self.environment.set_mode(Mode.PLAYER) + # e.g. CRCGCB, RGB24, GRAY8 + self.environment.set_screen_format(ScreenFormat.RGB24) + # e.g. RES_320X240, RES_640X480, RES_1920X1080 + self.environment.set_screen_resolution(ScreenResolution.RES_640X480) + self.environment.set_depth_buffer_enabled(False) + self.environment.set_labels_buffer_enabled(False) + self.environment.set_automap_buffer_enabled(False) + if seed is not None: + self.environment.setSeed(seed) + self.environment.init() + + self.state_shape = (480, 640, 3) + self.num_variables = self.environment.get_available_game_variables_size() + self.num_buttons = self.environment.get_available_buttons_size() + self.available_actions = [ + tuple(a) for a in itertools.product([0, 1], repeat=self.num_buttons) + ] + + def __str__(self): + return super().__str__() + '({})'.format(self.config_file) + + def states(self): + if self.include_variables: + return OrderedDict( + screen=dict(type='float', shape=self.state_shape, min_value=0.0, max_value=1.0), + variables=dict(type='float', shape=self.num_variables) + ) + else: + return dict(type='float', shape=self.state_shape, min_value=0.0, max_value=1.0) + + def actions(self): + if self.factored_action: + return dict(type='bool', shape=self.num_buttons) + else: + return dict(type='int', shape=(), num_values=len(self.available_actions)) + + def close(self): + self.environment.close() + self.environment = None + + def get_states(self): + state = self.environment.get_state() + screen = state.screen_buffer.astype(dtype=np.float32) / 255.0 + if self.include_variables: + return OrderedDict(screen=screen, variables=state.game_variables) + else: + return screen + + def reset(self): + self.environment.new_episode() + self.current_states = self.get_states() + return self.current_states + + def execute(self, actions): + if self.factored_action: + action = np.where(actions, 1.0, 0.0) + else: + action = self.available_actions[actions] + if self.visualize: + self.environment.set_action(action) + reward = 0.0 + for _ in range(self.frame_skip): + self.environment.advance_action() + reward += self.environment.get_last_reward() + else: + reward = self.environment.make_action(list(action), self.frame_skip) + terminal = self.environment.is_episode_finished() + if not terminal: + self.current_states = self.get_states() + return self.current_states, terminal, reward diff --git a/tensorforce/exception.py b/tensorforce/exception.py index 09981529c..6263fbeaf 100755 --- a/tensorforce/exception.py +++ b/tensorforce/exception.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. 
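# Illustrative construction sketch for the ViZDoom adapter above, assuming the vizdoom
# package is installed, the adapter is exported from tensorforce.environments, and
# 'scenarios/basic.cfg' is a placeholder path to a ViZDoom configuration file. With
# factored_action=False actions are a single int indexing all button combinations; with
# factored_action=True they are a bool vector with one entry per available button.

from tensorforce.environments import ViZDoom

environment = ViZDoom(
    level='scenarios/basic.cfg', visualize=False, include_variables=False,
    factored_action=False, frame_skip=12
)
print(environment.states())   # float screen spec of shape (480, 640, 3) with values in [0.0, 1.0]
print(environment.actions())  # int spec with num_values == number of button combinations
environment.close()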
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,14 +14,182 @@ # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division +def is_iterable(x): + if isinstance(x, str): + return False + try: + iter(x) + return True + except TypeError: + return False -class TensorForceError(Exception): +class TensorforceError(Exception): """ - TensorForce error + Tensorforce error """ - pass + def __init__(self, message): + if message[0].islower(): + message = message[0].upper() + message[1:] + if message[-1] not in '.!?': + message = message + '.' + super().__init__(message) + + @staticmethod + def unexpected(): + return TensorforceError(message="Unexpected error!") + + @staticmethod + def collision(name, value, group1, group2): + return TensorforceError( + message="{name} collision between {group1} and {group2}: {value}.".format( + name=name, group1=group1, group2=group2, value=value + ) + ) + + @staticmethod + def mismatch(name, value1, value2, argument=None): + if argument is None: + return TensorforceError( + message="{name} mismatch: {value2} != {value1}.".format( + name=name, value1=value1, value2=value2 + ) + ) + else: + return TensorforceError( + message="{name} mismatch for argument {argument}: {value2} != {value1}.".format( + name=name, argument=argument, value1=value1, value2=value2 + ) + ) + + @staticmethod + def exists(name, value): + return TensorforceError( + message="{name} already exists: {value}.".format(name=name, value=value) + ) + + @staticmethod + def exists_not(name, value): + return TensorforceError( + message="{name} does not exist: {value}.".format(name=name, value=value) + ) + + @staticmethod + def required_attribute(name, attribute): + return TensorforceError( + message="Required {name} attribute {attribute}.".format(name=name, attribute=attribute) + ) + + @staticmethod + def required(name, argument, expected=None, condition=None): + if condition is None: + if expected is None: + return TensorforceError( + message="Required {name} argument {argument}.".format( + name=name, argument=argument + ) + ) + else: + return TensorforceError( + message="Required {name} argument {argument} to be {expected}.".format( + name=name, argument=argument, expected=expected + ) + ) + else: + if expected is None: + return TensorforceError( + message="Required {name} argument {argument} given {condition}.".format( + name=name, argument=argument, condition=condition + ) + ) + else: + return TensorforceError( + message="Required {name} argument {argument} to be {expected} given " + "{condition}.".format( + name=name, argument=argument, expected=expected, condition=condition + ) + ) + + @staticmethod + def invalid(name, argument, condition=None): + if condition is None: + return TensorforceError( + message="Invalid {name} argument {argument}.".format(name=name, argument=argument) + ) + else: + return TensorforceError( + message="Invalid {name} argument {argument} given {condition}.".format( + name=name, condition=condition, argument=argument + ) + ) + + @staticmethod + def type(name, argument, dtype, condition=None, hint=None): + if hint is None: + if condition is None: + return TensorforceError( + message="Invalid type for {name} argument {argument}: {type}.".format( + name=name, argument=argument, type=dtype + ) + ) + else: + return TensorforceError( + message="Invalid type for 
{name} argument {argument} given {condition}: {type}.".format( + name=name, argument=argument, condition=condition, type=dtype + ) + ) + else: + if condition is None: + return TensorforceError( + message="Invalid type for {name} argument {argument}: {type} {hint}.".format( + name=name, argument=argument, type=dtype, hint=hint + ) + ) + else: + return TensorforceError( + message="Invalid type for {name} argument {argument} given {condition}: {type} {hint}.".format( + name=name, argument=argument, condition=condition, type=dtype, hint=hint + ) + ) + + @staticmethod + def value(name, argument, value, condition=None, hint=None): + if isinstance(value, dict): + value = str(value) + elif is_iterable(x=value): + value = ','.join(str(x) for x in value) + if hint is None: + if condition is None: + return TensorforceError( + message="Invalid value for {name} argument {argument}: {value}.".format( + name=name, argument=argument, value=value + ) + ) + else: + return TensorforceError( + message="Invalid value for {name} argument {argument} given {condition}: {value}.".format( + name=name, argument=argument, condition=condition, value=value + ) + ) + else: + if condition is None: + return TensorforceError( + message="Invalid value for {name} argument {argument}: {value} {hint}.".format( + name=name, argument=argument, value=value, hint=hint + ) + ) + else: + return TensorforceError( + message="Invalid value for {name} argument {argument} given {condition}: {value} {hint}.".format( + name=name, argument=argument, condition=condition, value=value, hint=hint + ) + ) + + @staticmethod + def deprecated(name, argument, replacement): + return DeprecationWarning( + "Deprecated {name} argument {argument}, use {replacement} instead.".format( + name=name, argument=argument, replacement=replacement + ) + ) diff --git a/tensorforce/execution/__init__.py b/tensorforce/execution/__init__.py index d3beee487..fef4ec499 100644 --- a/tensorforce/execution/__init__.py +++ b/tensorforce/execution/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,7 @@ # limitations under the License. # ============================================================================== -from tensorforce.execution.base_runner import BaseRunner -from tensorforce.execution.runner import Runner, SingleRunner, DistributedTFRunner -from tensorforce.execution.threaded_runner import ThreadedRunner, WorkerAgentGenerator +from tensorforce.execution.runner import Runner -__all__ = ['BaseRunner', 'SingleRunner', 'DistributedTFRunner', 'Runner', 'ThreadedRunner', 'WorkerAgentGenerator'] + +__all__ = ['Runner'] diff --git a/tensorforce/execution/base_runner.py b/tensorforce/execution/base_runner.py deleted file mode 100644 index f0e7cbb5f..000000000 --- a/tensorforce/execution/base_runner.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
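# Illustrative sketch of how the TensorforceError helpers above compose messages; the
# argument values are made up. Each helper returns an exception instance, so it is raised
# (or warned about, in the case of deprecated()) by the caller.

from tensorforce import TensorforceError

try:
    raise TensorforceError.value(name='Runner', argument='num_parallel', value=1, hint='< 2')
except TensorforceError as err:
    print(err)  # Invalid value for Runner argument num_parallel: 1 < 2.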
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - - -class BaseRunner(object): - """ - Base class for all runner classes. - Implements the `run` method. - """ - def __init__(self, agent, environment, repeat_actions=1, history=None): - """ - Args: - agent (Agent): Agent object (or list of Agent objects) to use for the run. - environment (Environment): Environment object (or list of Environment objects) to use for the run. - repeat_actions (int): How many times the same given action will be repeated in subsequent calls to - Environment's `execute` method. Rewards collected in these calls are accumulated and reported - as a sum in the following call to Agent's `observe` method. - history (dict): A dictionary containing an already run experiment's results. Keys should be: - episode_rewards (list of rewards), episode_timesteps (lengths of episodes), episode_times (run-times) - """ - self.agent = agent - self.environment = environment - self.repeat_actions = repeat_actions - - self.global_episode = None # the global episode number (across all (parallel) agents) - self.global_timestep = None # the global time step (across all (parallel) agents) - - self.start_time = None # TODO: is this necessary here? global start time (episode?, overall?) - - # lists of episode data (rewards, wall-times/timesteps) - self.episode_rewards = None # list of accumulated episode rewards - self.episode_timesteps = None # list of total timesteps taken in the episodes - self.episode_times = None # list of durations for the episodes - - self.reset(history) - - def reset(self, history=None): - """ - Resets the Runner's internal stats counters. - If history is empty, use default values in history.get(). - - Args: - history (dict): A dictionary containing an already run experiment's results. Keys should be: - episode_rewards (list of rewards), episode_timesteps (lengths of episodes), episode_times (run-times) - """ - if not history: - history = dict() - - self.episode_rewards = history.get("episode_rewards", list()) - self.episode_timesteps = history.get("episode_timesteps", list()) - self.episode_times = history.get("episode_times", list()) - - def close(self): - """ - Should perform clean up operations on Runner's Agent(s) and Environment(s). - """ - raise NotImplementedError - - def run(self, num_episodes, num_timesteps, max_episode_timesteps, deterministic, episode_finished, summary_report, - summary_interval): - """ - Executes this runner by starting to act (via Agent(s)) in the given Environment(s). - Stops execution according to certain conditions (e.g. max. number of episodes, etc..). - Calls callback functions after each episode and/or after some summary criteria are met. - - Args: - num_episodes (int): Max. number of episodes to run globally in total (across all threads/workers). - num_timesteps (int): Max. number of time steps to run globally in total (across all threads/workers) - max_episode_timesteps (int): Max. number of timesteps per episode. - deterministic (bool): Whether to use exploration when selecting actions. - episode_finished (callable): A function to be called once an episodes has finished. Should take - a BaseRunner object and some worker ID (e.g. thread-ID or task-ID). 
Can decide for itself - every how many episodes it should report something and what to report. - summary_report (callable): Deprecated; Function that could produce a summary over the training - progress so far. - summary_interval (int): Deprecated; The number of time steps to execute (globally) - before summary_report is called. - """ - raise NotImplementedError - - # keep backwards compatibility - @property - def episode(self): - """ - Deprecated property `episode` -> global_episode. - """ - return self.global_episode - - @property - def timestep(self): - """ - Deprecated property `timestep` -> global_timestep. - """ - return self.global_timestep - - diff --git a/tensorforce/execution/runner.py b/tensorforce/execution/runner.py index e3e84ef39..15f75eb9c 100644 --- a/tensorforce/execution/runner.py +++ b/tensorforce/execution/runner.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,139 +13,895 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division +import time -from tensorforce.execution.base_runner import BaseRunner +import numpy as np +from tqdm.auto import tqdm -import time -from six.moves import xrange -import warnings -from inspect import getargspec +from tensorforce import Agent, Environment, TensorforceError, util +from tensorforce.environments import RemoteEnvironment -class Runner(BaseRunner): +class Runner(object): """ - Simple runner for non-realtime single-process execution. + Tensorforce runner utility. + + Args: + agent (specification | Agent object | Agent.load kwargs): Agent specification or object + (note: if passed as object, `agent.close()` is not (!) automatically triggered + as part of `runner.close()`), or keyword arguments to `Agent.load()` in particular + containing `directory`, in all cases argument `environment` is implicitly specified + as the following argument, and argument `parallel_interactions` is either implicitly + specified as `num_parallel` or expected to be at least `num_parallel` + (required). + environment (specification | Environment object): Environment specification or object + (note: if passed as object, `environment.close()` is not (!) automatically + triggered as part of `runner.close()`), where argument `max_episode_timesteps` is + implicitly specified as the following argument + (required, or alternatively `environments`, + invalid for "socket-client" remote mode). + max_episode_timesteps (int > 0): Maximum number of timesteps per episode, overwrites the + environment default if defined + (default: environment default, invalid for + "socket-client" remote mode). + num_parallel (int >= 2): Number of environment instances to execute in parallel, usually + requires argument `remote` to be specified for proper parallel execution unless + vectorizable environment + (default: no parallel execution, implicitly + specified by `environments`). 
+ environments (list[specification | Environment object]): Environment specifications or + objects to execute in parallel, the latter are not closed automatically as part of + `runner.close()` + (default: no parallel execution, + alternatively specified via `environment` and `num_parallel`, invalid for + "socket-client" remote mode). + evaluation (bool): Whether to run the last of multiple parallel environments in evaluation + mode, only valid with `num_parallel` or `environments` + (default: no evaluation). + remote ("multiprocessing" | "socket-client"): Communication mode for remote environment + execution of parallelized environment execution, not compatible with environment(s) + given as Environment objects, "socket-client" mode requires a corresponding + "socket-server" running + (default: local execution). + blocking (bool): Whether remote environment calls should be blocking, only valid if remote + mode given + (default: not blocking, invalid unless + "multiprocessing" or "socket-client" remote mode). + host (str, iter[str]): Socket server hostname(s) or IP address(es) + (required only for "socket-client" remote + mode). + port (int, iter[int]): Socket server port(s), increasing sequence if single host and port + given + (required only for "socket-client" remote + mode). """ - def __init__(self, agent, environment, repeat_actions=1, history=None, id_=0): - """ - Initialize a single Runner object (one Agent/one Environment). + def __init__( + self, agent, environment=None, max_episode_timesteps=None, num_parallel=None, + environments=None, evaluation=False, remote=None, blocking=False, host=None, port=None + ): + if environment is None and environments is None: + if remote != 'socket-client': + raise TensorforceError.required( + name='Runner', argument='environment or environments' + ) + if num_parallel is None: + raise TensorforceError.required( + name='Runner', argument='num_parallel', condition='socket-client remote mode' + ) + environments = [None for _ in range(num_parallel)] - Args: - id_ (int): The ID of this Runner (for distributed TF runs). 
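# Illustrative construction sketch for the Runner above: four environment copies executed in
# separate processes via remote='multiprocessing'. The agent and environment specifications
# ('ppo' with batch_size, the 'gym' level) are placeholders, not prescribed by this diff;
# runner.run() and runner.close() follow as sketched after the run() documentation below.

from tensorforce.execution import Runner

runner = Runner(
    agent=dict(agent='ppo', batch_size=10),
    environment=dict(environment='gym', level='CartPole-v1'),
    max_episode_timesteps=500, num_parallel=4, remote='multiprocessing'
)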
- """ - super(Runner, self).__init__(agent, environment, repeat_actions, history) + elif environment is None: + if environments is None: + raise TensorforceError.required( + name='Runner', argument='environment or environments' + ) + if not util.is_iterable(x=environments): + raise TensorforceError.type( + name='Runner', argument='environments', value=environments + ) + if len(environments) == 0: + raise TensorforceError.value( + name='Runner', argument='len(environments)', value=len(environments) + ) + if num_parallel is not None and num_parallel != len(environments): + raise TensorforceError.value( + name='Runner', argument='num_parallel', value=num_parallel, + hint='!= len(environments)' + ) + num_parallel = len(environments) + environments = list(environments) + + elif num_parallel is None: + if environments is not None: + raise TensorforceError.invalid( + name='Runner', argument='environments', condition='environment is specified' + ) + if evaluation: + raise TensorforceError.invalid( + name='Runner', argument='evaluation', condition='single environment' + ) + num_parallel = 1 + environments = [environment] + + else: + if not isinstance(num_parallel, int): + raise TensorforceError.value( + name='Runner', argument='num_parallel', dtype=type(num_parallel) + ) + elif num_parallel < 2: + raise TensorforceError.value( + name='Runner', argument='num_parallel', value=num_parallel, hint='< 2' + ) + if environments is not None: + raise TensorforceError.invalid( + name='Runner', argument='environments', condition='environment is specified' + ) + if isinstance(environment, Environment): + raise TensorforceError.value( + name='Runner', argument='environment', value=environment, + condition='num_parallel', + hint='is Environment instance, but specification dict is required' + ) + environments = [environment for _ in range(num_parallel)] + + if port is None or isinstance(port, int): + if isinstance(host, str): + port = [port + n for n in range(num_parallel)] + else: + port = [port for _ in range(num_parallel)] + else: + if len(port) != num_parallel: + raise TensorforceError.value( + name='Runner', argument='len(port)', value=len(port), hint='!= num_parallel' + ) + if host is None or isinstance(host, str): + host = [host for _ in range(num_parallel)] + else: + if len(host) != num_parallel: + raise TensorforceError.value( + name='Runner', argument='len(host)', value=len(host), hint='!= num_parallel' + ) - self.id = id_ # the worker's ID in a distributed run (default=0) - self.current_timestep = None # the time step in the current episode + self.environments = list() + self.is_environment_external = isinstance(environments[0], Environment) + environment = Environment.create( + environment=environments[0], max_episode_timesteps=max_episode_timesteps, + remote=remote, blocking=blocking, host=host[0], port=port[0] + ) + self.is_environment_remote = isinstance(environment, RemoteEnvironment) + states = environment.states() + actions = environment.actions() + self.environments.append(environment) + if remote is None and num_parallel > 1 and environment.is_vectorizable(): + self.num_vectorized = num_parallel + environments = environments[:1] + if evaluation: + raise TensorforceError.invalid( + name='Runner', argument='evaluation', condition='vectorized environment' + ) + elif environment.num_actors() > 1: + assert num_parallel == 1 + num_parallel = environment.num_actors() + self.num_vectorized = environment.num_actors() + else: + self.num_vectorized = None + + for n, environment in 
enumerate(environments[1:], start=1): + assert isinstance(environment, Environment) == self.is_environment_external + environment = Environment.create( + environment=environment, max_episode_timesteps=max_episode_timesteps, + remote=remote, blocking=blocking, host=host[n], port=port[n] + ) + assert isinstance(environment, RemoteEnvironment) == self.is_environment_remote + assert util.is_equal(x=environment.states(), y=states) + assert util.is_equal(x=environment.actions(), y=actions) + self.environments.append(environment) + + self.evaluation = evaluation + + self.is_agent_external = isinstance(agent, Agent) + if not self.is_agent_external and 'directory' in agent: + self.agent = Agent.load( + **agent, environment=environment, + parallel_interactions=(num_parallel - int(self.evaluation)) + ) + elif num_parallel - int(self.evaluation) > 1: + self.agent = Agent.create( + agent=agent, environment=environment, + parallel_interactions=(num_parallel - int(self.evaluation)) + ) + else: + self.agent = Agent.create(agent=agent, environment=environment) def close(self): - self.agent.close() - self.environment.close() + if hasattr(self, 'tqdm'): + self.tqdm.close() + if not self.is_agent_external: + self.agent.close() + if not self.is_environment_external: + for environment in self.environments: + environment.close() # TODO: make average reward another possible criteria for runner-termination - def run(self, num_timesteps=None, num_episodes=None, max_episode_timesteps=None, deterministic=False, - episode_finished=None, summary_report=None, summary_interval=None, timesteps=None, episodes=None - ): + def run( + self, + # General + num_episodes=None, num_timesteps=None, num_updates=None, + # Parallel + batch_agent_calls=False, sync_timesteps=False, sync_episodes=False, num_sleep_secs=0.001, + # Callback + callback=None, callback_episode_frequency=None, callback_timestep_frequency=None, + # Tqdm + use_tqdm=True, mean_horizon=1, + # Evaluation + evaluation=False, save_best_agent=None, evaluation_callback=None + ): """ + Run experiment. + Args: - timesteps (int): Deprecated; see num_timesteps. - episodes (int): Deprecated; see num_episodes. + num_episodes (int > 0): Number of episodes to run experiment, sum of episodes across all + parallel/vectorized environment(s) / actors in a multi-actor environment + (default: no episode limit). + num_timesteps (int > 0): Number of timesteps to run experiment, sum of timesteps across + all parallel/vectorized environment(s) / actors in a multi-actor environment + (default: no timestep limit). + num_updates (int > 0): Number of agent updates to run experiment + (default: no update limit). + batch_agent_calls (bool): Whether to batch agent calls for parallel environment + execution + (default: false, separate call per + environment). + sync_timesteps (bool): Whether to synchronize parallel environment execution on + timestep-level, implied by batch_agent_calls + (default: false, unless + batch_agent_calls is true). + sync_episodes (bool): Whether to synchronize parallel environment execution on + episode-level + (default: false). + num_sleep_secs (float): Sleep duration if no environment is ready + (default: one milliseconds). + callback (callable[(Runner, parallel) -> bool]): Callback function taking the runner + instance plus parallel index and returning a boolean value indicating whether + execution should continue + (default: callback always true). + callback_episode_frequency (int): Episode interval between callbacks + (default: every episode). 
+ callback_timestep_frequency (int): Timestep interval between callbacks + (default: not specified). + use_tqdm (bool): Whether to display a tqdm progress bar for the experiment run + (default: true), with the following + additional information (averaged over number of episodes given via mean_horizon): +
+                • return – cumulative episode return
+                • std – standard deviation of cumulative episode return
+                • ts/ep – timesteps per episode
+                • sec/ep – seconds per episode
+                • ms/ts – milliseconds per timestep
+                • agent – percentage of time spent on agent computation
+                • comm – if remote environment execution, percentage of time spent on communication
                + mean_horizon (int): Number of episodes progress bar values and evaluation score are + averaged over (default: not averaged). + evaluation (bool): Whether to run in evaluation mode, only valid if single environment + (default: no evaluation). + save_best_agent (string): Directory to save the best version of the agent according to + the evaluation score + (default: best agent is not saved). + evaluation_callback (int | callable[Runner -> float]): Callback function taking the + runner instance and returning an evaluation score + (default: cumulative evaluation return + averaged over mean_horizon episodes). """ + # General + if num_episodes is None: + self.num_episodes = float('inf') + else: + self.num_episodes = num_episodes + if num_timesteps is None: + self.num_timesteps = float('inf') + else: + self.num_timesteps = num_timesteps + if num_updates is None: + self.num_updates = float('inf') + else: + self.num_updates = num_updates + + # Parallel + if len(self.environments) == 1: + condition = 'single environment' + elif self.num_vectorized is not None: + condition = 'vectorized environment' + else: + condition = None + if condition is None: + pass + elif batch_agent_calls: + raise TensorforceError.invalid( + name='Runner.run', argument='batch_agent_calls', condition=condition + ) + elif sync_timesteps: + raise TensorforceError.invalid( + name='Runner.run', argument='sync_timesteps', condition=condition + ) + elif sync_episodes: + raise TensorforceError.invalid( + name='Runner.run', argument='sync_episodes', condition=condition + ) + self.batch_agent_calls = batch_agent_calls or (self.num_vectorized is not None) + self.sync_timesteps = sync_timesteps or self.batch_agent_calls + self.sync_episodes = sync_episodes or (self.num_vectorized is not None) + self.num_sleep_secs = num_sleep_secs + if self.num_vectorized is None: + self.num_environments = len(self.environments) + else: + self.num_environments = self.num_vectorized + + # Callback + assert callback_episode_frequency is None or callback_timestep_frequency is None + if callback_episode_frequency is None and callback_timestep_frequency is None: + callback_episode_frequency = 1 + if callback_episode_frequency is None: + self.callback_episode_frequency = float('inf') + else: + self.callback_episode_frequency = callback_episode_frequency + if callback_timestep_frequency is None: + self.callback_timestep_frequency = float('inf') + else: + self.callback_timestep_frequency = callback_timestep_frequency + if callback is None: + self.callback = (lambda r, p: True) + elif util.is_iterable(x=callback): + def sequential_callback(runner, parallel): + result = True + for fn in callback: + x = fn(runner, parallel) + if isinstance(result, bool): + result = result and x + return result + self.callback = sequential_callback + else: + def boolean_callback(runner, parallel): + result = callback(runner, parallel) + if isinstance(result, bool): + return result + else: + return True + self.callback = boolean_callback + + # Experiment statistics + self.episode_returns = list() + self.episode_timesteps = list() + self.episode_seconds = list() + self.episode_agent_seconds = list() + if self.is_environment_remote: + self.episode_env_seconds = list() + if self.evaluation or evaluation: + self.evaluation_returns = list() + self.evaluation_timesteps = list() + self.evaluation_seconds = list() + self.evaluation_agent_seconds = list() + if self.is_environment_remote: + self.evaluation_env_seconds = list() + if self.num_environments == 1: + # for tqdm + 
self.episode_returns = self.evaluation_returns + self.episode_timesteps = self.evaluation_timesteps + self.episode_seconds = self.evaluation_seconds + self.episode_agent_seconds = self.evaluation_agent_seconds + if self.is_environment_remote: + self.episode_env_seconds = self.evaluation_env_seconds + else: + # for tqdm + self.evaluation_returns = self.episode_returns + self.evaluation_timesteps = self.episode_timesteps + self.evaluation_seconds = self.episode_seconds + self.evaluation_agent_seconds = self.episode_agent_seconds + if self.is_environment_remote: + self.evaluation_env_seconds = self.episode_env_seconds + + # Timestep/episode/update counter + self.timesteps = 0 + self.episodes = 0 + self.updates = 0 + + # Tqdm + if use_tqdm: + if hasattr(self, 'tqdm'): + self.tqdm.close() - # deprecation warnings - if timesteps is not None: - num_timesteps = timesteps - warnings.warn("WARNING: `timesteps` parameter is deprecated, use `num_timesteps` instead.", - category=DeprecationWarning) - if episodes is not None: - num_episodes = episodes - warnings.warn("WARNING: `episodes` parameter is deprecated, use `num_episodes` instead.", - category=DeprecationWarning) + assert self.num_episodes != float('inf') or self.num_timesteps != float('inf') + inner_callback = self.callback - # figure out whether we are using the deprecated way of "episode_finished" reporting - old_episode_finished = False - if episode_finished is not None and len(getargspec(episode_finished).args) == 1: - old_episode_finished = True + if self.num_episodes != float('inf'): + # Episode-based tqdm (default option if both num_episodes and num_timesteps set) + assert self.num_episodes != float('inf') + bar_format = ( + '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, return={postfix[0]:.2f}, stddev={postfix[1]:.2f}, ts/ep=' + '{postfix[2]}, sec/ep={postfix[3]:.2f}, ms/ts={postfix[4]:.1f}, agent=' + '{postfix[5]:.1f}%]' + ) + postfix = [0.0, 0.0, 0, 0.0, 0.0, 0.0] + if self.is_environment_remote: + bar_format = bar_format[:-1] + ', comm={postfix[6]:.1f}%]' + postfix.append(0.0) - # Keep track of episode reward and episode length for statistics. 
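For reference, a minimal usage sketch of the Runner constructor and `run()` method documented above. The `ppo` agent spec, the Gym `CartPole-v1` level, and the concrete hyperparameters are illustrative assumptions, not values taken from this changeset:

```python
# Single-environment experiment via the new Runner API (illustrative sketch).
from tensorforce.execution import Runner

runner = Runner(
    agent=dict(agent='ppo', batch_size=10),                    # assumed agent specification
    environment=dict(environment='gym', level='CartPole-v1'),  # assumed environment specification
    max_episode_timesteps=500
)
runner.run(num_episodes=200)                  # training run with tqdm progress bar
runner.run(num_episodes=20, evaluation=True)  # evaluation mode, valid for a single environment
runner.close()                                # closes agent/environments created from specs
```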
- self.start_time = time.time() + self.tqdm = tqdm( + desc='Episodes', total=self.num_episodes, bar_format=bar_format, + initial=self.episodes, postfix=postfix + ) + self.tqdm_last_update = self.episodes + def tqdm_callback(runner, parallel): + if len(runner.evaluation_returns) > 0: + mean_return = float(np.mean(runner.evaluation_returns[-mean_horizon:])) + std_return = float(np.std(runner.evaluation_returns[-mean_horizon:])) + runner.tqdm.postfix[0] = mean_return + runner.tqdm.postfix[1] = std_return + if len(runner.episode_timesteps) > 0: + mean_ts_per_ep = int(np.mean(runner.episode_timesteps[-mean_horizon:])) + mean_sec_per_ep = float(np.mean(runner.episode_seconds[-mean_horizon:])) + mean_agent_sec = float( + np.mean(runner.episode_agent_seconds[-mean_horizon:]) + ) + try: + mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep + except ZeroDivisionError: + mean_ms_per_ts = 0.0 + try: + mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep + except ZeroDivisionError: + mean_rel_agent = 0.0 + runner.tqdm.postfix[2] = mean_ts_per_ep + runner.tqdm.postfix[3] = mean_sec_per_ep + runner.tqdm.postfix[4] = mean_ms_per_ts + runner.tqdm.postfix[5] = mean_rel_agent + if runner.is_environment_remote and len(runner.episode_env_seconds) > 0: + mean_env_sec = float(np.mean(runner.episode_env_seconds[-mean_horizon:])) + mean_rel_comm = (mean_agent_sec + mean_env_sec) * 100.0 / mean_sec_per_ep + mean_rel_comm = 100.0 - mean_rel_comm + runner.tqdm.postfix[6] = mean_rel_comm + runner.tqdm.update(n=(runner.episodes - runner.tqdm_last_update)) + runner.tqdm_last_update = runner.episodes + return inner_callback(runner, parallel) + + else: + # Timestep-based tqdm + self.tqdm = tqdm( + desc='Timesteps', total=self.num_timesteps, initial=self.timesteps, + postfix=dict(mean_return='n/a') + ) + self.tqdm_last_update = self.timesteps + + def tqdm_callback(runner, parallel): + # sum_timesteps_return = sum(runner.timestep_returns[num_mean_return:]) + # num_timesteps = min(num_mean_return, runner.evaluation_timestep) + # mean_return = sum_timesteps_return / num_episodes + runner.tqdm.set_postfix(mean_return='n/a') + runner.tqdm.update(n=(runner.timesteps - runner.tqdm_last_update)) + runner.tqdm_last_update = runner.timesteps + return inner_callback(runner, parallel) + + self.callback = tqdm_callback + + # Evaluation + if evaluation and self.num_environments > 1: + raise TensorforceError.invalid( + name='Runner.run', argument='evaluation', condition='parallel environments' + ) + self.evaluation_run = self.evaluation or evaluation + self.save_best_agent = save_best_agent + if evaluation_callback is None: + self.evaluation_callback = (lambda r: None) + else: + self.evaluation_callback = evaluation_callback + if self.save_best_agent is not None: + inner_evaluation_callback = self.evaluation_callback + + def mean_return_callback(runner): + result = inner_evaluation_callback(runner) + if result is None: + return float(np.mean(runner.evaluation_returns[-mean_horizon:])) + else: + return result + + self.evaluation_callback = mean_return_callback + self.best_evaluation_score = None + + # Episode statistics + self.episode_return = [0.0 for _ in range(self.num_environments)] + self.episode_timestep = [0 for _ in range(self.num_environments)] + # if self.batch_agent_calls: + # self.episode_agent_second = 0.0 + # self.episode_start = time.time() + if self.evaluation_run: + self.episode_agent_second = [0.0 for _ in range(self.num_environments - 1)] + self.episode_start = [time.time() for _ in 
range(self.num_environments - 1)] + else: + self.episode_agent_second = [0.0 for _ in range(self.num_environments)] + self.episode_start = [time.time() for _ in range(self.num_environments)] + self.evaluation_agent_second = 0.0 + self.evaluation_start = time.time() + + # Values + self.terminate = 0 + self.prev_terminals = [-1 for _ in range(self.num_environments)] + self.states = [None for _ in range(self.num_environments)] + self.terminals = [None for _ in range(self.num_environments)] + self.rewards = [None for _ in range(self.num_environments)] + if self.evaluation_run: + self.evaluation_internals = self.agent.initial_internals() + + # Required if agent was previously stopped mid-episode self.agent.reset() - if num_episodes is not None: - num_episodes += self.agent.episode - - if num_timesteps is not None: - num_timesteps += self.agent.timestep - - # episode loop - while True: - episode_start_time = time.time() - state = self.environment.reset() - self.agent.reset() - - # Update global counters. - self.global_episode = self.agent.episode # global value (across all agents) - self.global_timestep = self.agent.timestep # global value (across all agents) - - episode_reward = 0 - self.current_timestep = 0 - - # time step (within episode) loop - while True: - action = self.agent.act(states=state, deterministic=deterministic) - - reward = 0 - for repeat in xrange(self.repeat_actions): - state, terminal, step_reward = self.environment.execute(actions=action) - reward += step_reward - if terminal: - break - - if max_episode_timesteps is not None and self.current_timestep >= max_episode_timesteps: - terminal = True - - self.agent.observe(terminal=terminal, reward=reward) - - self.global_timestep += 1 - self.current_timestep += 1 - episode_reward += reward - - if terminal or self.agent.should_stop(): # TODO: should_stop also terminate? - break - - # Update our episode stats. - time_passed = time.time() - episode_start_time - self.episode_rewards.append(episode_reward) - self.episode_timesteps.append(self.current_timestep) - self.episode_times.append(time_passed) - - self.global_episode += 1 - - # Check, whether we should stop this run. 
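The parallel-execution path exercised in the loop above is driven entirely by the constructor and `run()` arguments documented earlier. A hedged sketch follows, again with assumed agent/environment specifications; note that, per the constructor checks above, parallel execution requires specification dicts rather than Environment instances:

```python
# Parallel environment execution (illustrative sketch): four worker processes via
# the "multiprocessing" remote mode, with batched agent calls and synchronized episodes.
from tensorforce.execution import Runner

runner = Runner(
    agent=dict(agent='ppo', batch_size=10),                    # assumed agent spec
    environment=dict(environment='gym', level='CartPole-v1'),  # assumed environment spec
    max_episode_timesteps=500,
    num_parallel=4, remote='multiprocessing'
)
runner.run(num_episodes=400, batch_agent_calls=True, sync_episodes=True)
runner.close()
```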
- if episode_finished is not None: - # deprecated way (passing in only runner object): - if old_episode_finished: - if not episode_finished(self): - break - # new unified way (passing in BaseRunner AND some worker ID): - elif not episode_finished(self, self.id): - break - if (num_episodes is not None and self.global_episode >= num_episodes) or \ - (num_timesteps is not None and self.global_timestep >= num_timesteps) or \ - self.agent.should_stop(): - break - - # keep backwards compatibility - @property - def episode_timestep(self): - return self.current_timestep - - -# more descriptive alias for Runner class -DistributedTFRunner = Runner -SingleRunner = Runner + # Reset environments + if self.num_vectorized is None: + for environment in self.environments: + environment.start_reset() + else: + if self.environments[0].is_vectorizable(): + parallel, states = self.environments[0].reset(num_parallel=self.num_vectorized) + else: + parallel, states = self.environments[0].reset() + for i, n in enumerate(parallel): + self.states[n] = states[i] + self.prev_terminals[n] = -2 + + # Runner loop + while any(terminal <= 0 for terminal in self.prev_terminals): + self.terminals = [None for _ in self.terminals] + + if self.batch_agent_calls: + + if self.num_vectorized is None: + # Retrieve observations (only if not already terminated) + while any(terminal is None for terminal in self.terminals): + for n in range(self.num_environments): + if self.terminals[n] is not None: + # Already received + continue + elif self.prev_terminals[n] <= 0: + # Receive if not terminal + observation = self.environments[n].receive_execute() + if observation is None: + continue + self.states[n], self.terminals[n], self.rewards[n] = observation + else: + # Terminal + self.states[n] = None + self.terminals[n] = self.prev_terminals[n] + self.rewards[n] = None + + else: + # Vectorized environment execute + if all(terminal >= -1 for terminal in self.prev_terminals): + parallel, states, terminals, rewards = self.environments[0].execute( + actions=np.asarray(self.actions) + ) + i = 0 + for n, terminal in enumerate(self.prev_terminals): + if terminal <= 0: + self.terminals[n] = terminals[i] + self.rewards[n] = rewards[i] + if terminals[i] > 0: + self.states[n] = None + i += 1 + else: + self.states[n] = None + self.terminals[n] = self.prev_terminals[n] + self.rewards[n] = None + for i, n in enumerate(parallel): + assert self.terminals[n] <= 0 or self.terminals[n] == 2 + self.states[n] = states[i] + else: + for n, terminal in enumerate(self.prev_terminals): + if terminal < -1: + self.terminals[n] = -1 + else: + self.terminals[n] = self.prev_terminals[n] + + self.handle_observe_joint() + self.handle_act_joint() + + # Parallel environments loop + no_environment_ready = True + for n in range(self.num_environments): + + if self.prev_terminals[n] > 0: + # Continue if episode terminated (either sync_episodes or finished) + self.terminals[n] = self.prev_terminals[n] + continue + + elif self.batch_agent_calls: + # Handled before parallel environments loop + pass + + elif self.sync_timesteps: + # Wait until environment is ready + while True: + observation = self.environments[n].receive_execute() + if observation is not None: + break + + else: + # Check whether environment is ready, otherwise continue + observation = self.environments[n].receive_execute() + if observation is None: + self.terminals[n] = self.prev_terminals[n] + continue + + no_environment_ready = False + if not self.batch_agent_calls: + self.states[n], self.terminals[n], 
self.rewards[n] = observation + + # Check whether evaluation environment + if self.evaluation_run and n == self.num_environments - 1: + if self.terminals[n] == -1: + # Initial act + self.handle_act_evaluation() + else: + # Observe + self.handle_observe_evaluation() + if self.terminals[n] == 0: + # Act + self.handle_act_evaluation() + else: + # Terminal + self.handle_terminal_evaluation() + + else: + if self.terminals[n] == -1: + # Initial act + self.handle_act(parallel=n) + else: + # Observe + self.handle_observe(parallel=n) + if self.terminals[n] == 0: + # Act + self.handle_act(parallel=n) + else: + # Terminal + self.handle_terminal(parallel=n) + + self.prev_terminals = list(self.terminals) + + # Sync_episodes: Reset if all episodes terminated + if self.sync_episodes and all(terminal > 0 for terminal in self.terminals): + num_episodes_left = self.num_episodes - self.episodes + if self.num_vectorized is None: + num_noneval_environments = self.num_environments - int(self.evaluation_run) + for n in range(min(num_noneval_environments, num_episodes_left)): + self.prev_terminals[n] = -1 + self.environments[n].start_reset() + if self.evaluation_run and num_episodes_left > 0: + self.prev_terminals[-1] = -1 + self.environments[-1].start_reset() + elif num_episodes_left > 0: + if self.environments[0].is_vectorizable(): + parallel, states = self.environments[0].reset( + num_parallel=min(num_episodes_left, self.num_vectorized) + ) + else: + parallel, states = self.environments[0].reset() + for i, n in enumerate(parallel): + self.states[n] = states[i] + self.prev_terminals[n] = -2 + else: + self.prev_terminals = list() + + # Sleep if no environment was ready + if no_environment_ready: + time.sleep(self.num_sleep_secs) + + def handle_act(self, parallel): + if self.batch_agent_calls: + if self.num_vectorized is None: + self.environments[parallel].start_execute(actions=self.actions[parallel]) + + else: + agent_start = time.time() + actions = self.agent.act(states=self.states[parallel], parallel=parallel) + self.episode_agent_second[parallel] += time.time() - agent_start + + self.environments[parallel].start_execute(actions=actions) + + # Update episode statistics + self.episode_timestep[parallel] += 1 + + # Maximum number of timesteps or timestep callback (after counter increment!) 
+ self.timesteps += 1 + if (( + self.episode_timestep[parallel] % self.callback_timestep_frequency == 0 and + not self.callback(self, parallel) + ) or self.timesteps >= self.num_timesteps): + self.terminate = 2 + + def handle_act_joint(self): + parallel = [ + n for n in range(self.num_environments - int(self.evaluation_run)) + if self.terminals[n] <= 0 + ] + if len(parallel) > 0: + agent_start = time.time() + self.actions = self.agent.act( + states=[self.states[p] for p in parallel], parallel=parallel + ) + agent_second = (time.time() - agent_start) / len(parallel) + for p in parallel: + self.episode_agent_second[p] += agent_second + if self.num_vectorized is None: + self.actions = [ + self.actions[parallel.index(n)] if n in parallel else None + for n in range(self.num_environments) + ] + + if self.evaluation_run and self.terminals[-1] <= 0: + assert self.num_vectorized is None + agent_start = time.time() + self.actions[-1], self.evaluation_internals = self.agent.act( + states=self.states[-1], internals=self.evaluation_internals, independent=True, + deterministic=True + ) + self.episode_agent_second[-1] += time.time() - agent_start + + def handle_act_evaluation(self): + if self.batch_agent_calls: + assert self.num_vectorized is None + actions = self.actions[-1] + + else: + agent_start = time.time() + actions, self.evaluation_internals = self.agent.act( + states=self.states[-1], internals=self.evaluation_internals, independent=True, + deterministic=True + ) + self.evaluation_agent_second += time.time() - agent_start + + self.environments[-1].start_execute(actions=actions) + + # Update episode statistics + self.episode_timestep[-1] += 1 + + # Maximum number of timesteps or timestep callback (after counter increment!) + if self.evaluation_run and self.num_environments == 1: + self.timesteps += 1 + if (( + self.episode_timestep[-1] % self.callback_timestep_frequency == 0 and + not self.callback(self, -1) + ) or self.timesteps >= self.num_timesteps): + self.terminate = 2 + + def handle_observe(self, parallel): + # Update episode statistics + self.episode_return[parallel] += self.rewards[parallel] + + # Not terminal but finished + if self.terminals[parallel] == 0 and self.terminate == 2: + self.terminals[parallel] = 2 + + # Observe unless batch_agent_calls + if not self.batch_agent_calls: + agent_start = time.time() + updated = self.agent.observe( + terminal=self.terminals[parallel], reward=self.rewards[parallel], parallel=parallel + ) + self.episode_agent_second[parallel] += time.time() - agent_start + self.updates += int(updated) + + # Maximum number of updates (after counter increment!) 
+ if self.updates >= self.num_updates: + self.terminate = 2 + + def handle_observe_joint(self): + parallel = [ + n for n in range(self.num_environments - int(self.evaluation_run)) + if self.prev_terminals[n] <= 0 and self.terminals[n] >= 0 + ] + if len(parallel) > 0: + agent_start = time.time() + updated = self.agent.observe( + terminal=[self.terminals[p] for p in parallel], + reward=[self.rewards[p] for p in parallel], parallel=parallel + ) + agent_second = (time.time() - agent_start) / len(parallel) + for p in parallel: + self.episode_agent_second[p] += agent_second + self.updates += updated + + def handle_observe_evaluation(self): + # Update episode statistics + self.episode_return[-1] += self.rewards[-1] + + # Reset agent if terminal + if self.terminals[-1] > 0 or self.terminate == 2: + agent_start = time.time() + self.evaluation_agent_second += time.time() - agent_start + + def handle_terminal(self, parallel): + # Update experiment statistics + if self.num_vectorized is None: + actual_episode_return = self.environments[parallel].episode_return() + else: + actual_episode_return = self.environments[0].episode_return(parallel=parallel) + if actual_episode_return is None: + self.episode_returns.append(self.episode_return[parallel]) + else: + self.episode_returns.append(actual_episode_return) + self.episode_timesteps.append(self.episode_timestep[parallel]) + self.episode_seconds.append(time.time() - self.episode_start[parallel]) + self.episode_agent_seconds.append(self.episode_agent_second[parallel]) + if self.is_environment_remote: + self.episode_env_seconds.append(self.environments[parallel]._episode_seconds) + + # Maximum number of episodes or episode callback (after counter increment!) + self.episodes += 1 + if self.terminate == 0 and (( + self.episodes % self.callback_episode_frequency == 0 and + not self.callback(self, parallel) + ) or self.episodes >= self.num_episodes): + self.terminate = 1 + + # Reset episode statistics + self.episode_return[parallel] = 0.0 + self.episode_timestep[parallel] = 0 + self.episode_agent_second[parallel] = 0.0 + self.episode_start[parallel] = time.time() + + # Reset environment + if self.terminate == 0 and not self.sync_episodes: + self.terminals[parallel] = -1 + self.environments[parallel].start_reset() + + def handle_terminal_evaluation(self): + # Update experiment statistics + if self.num_vectorized is None: + actual_episode_return = self.environments[-1].episode_return() + else: + actual_episode_return = self.environments[0].episode_return(parallel=-1) + if actual_episode_return is None: + self.evaluation_returns.append(self.episode_return[-1]) + else: + self.evaluation_returns.append(actual_episode_return) + self.evaluation_timesteps.append(self.episode_timestep[-1]) + self.evaluation_seconds.append(time.time() - self.evaluation_start) + self.evaluation_agent_seconds.append(self.evaluation_agent_second) + if self.is_environment_remote: + self.evaluation_env_seconds.append(self.environments[-1]._episode_seconds) + + # Evaluation callback + if self.save_best_agent is not None: + evaluation_score = self.evaluation_callback(self) + assert isinstance(evaluation_score, float) + if self.best_evaluation_score is None: + self.best_evaluation_score = evaluation_score + elif evaluation_score > self.best_evaluation_score: + self.best_evaluation_score = evaluation_score + self.agent.save( + directory=self.save_best_agent, filename='best-model', append=None + ) + else: + self.evaluation_callback(self) + + # Maximum number of episodes or episode callback (after 
counter increment!) + if self.evaluation_run and self.num_environments == 1: + self.episodes += 1 + if self.terminate == 0 and (( + self.episodes % self.callback_episode_frequency == 0 and + not self.callback(self, -1) + ) or self.episodes >= self.num_episodes): + self.terminate = 1 + + # Reset episode statistics + self.episode_return[-1] = 0.0 + self.episode_timestep[-1] = 0 + self.evaluation_agent_second = 0.0 + self.evaluation_start = time.time() + + # Reset environment + if self.terminate == 0 and not self.sync_episodes: + self.terminals[-1] = 0 + self.environments[-1].start_reset() + self.evaluation_internals = self.agent.initial_internals() diff --git a/tensorforce/execution/threaded_runner.py b/tensorforce/execution/threaded_runner.py deleted file mode 100644 index a443d83e3..000000000 --- a/tensorforce/execution/threaded_runner.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import importlib -from inspect import getargspec -from six.moves import xrange -import threading -import time -import warnings - -from tensorforce import TensorForceError -from tensorforce.execution.base_runner import BaseRunner -from tensorforce.agents.learning_agent import LearningAgent -from tensorforce.agents import agents as AgentsDictionary - - -class ThreadedRunner(BaseRunner): - """ - Runner for non-realtime threaded execution of multiple agents. - """ - - def __init__(self, agent, environment, repeat_actions=1, save_path=None, save_episodes=None, save_frequency=None, - save_frequency_unit=None, agents=None, environments=None): - """ - Initialize a ThreadedRunner object. - - Args: - save_path (str): Path where to save the shared model. - save_episodes (int): Deprecated: Every how many (global) episodes do we save the shared model? - save_frequency (int): The frequency with which to save the model (could be sec, steps, or episodes). - save_frequency_unit (str): "s" (sec), "t" (timesteps), "e" (episodes) - agents (List[Agent]): Deprecated: List of Agent objects. Use `agent`, instead. - environments (List[Environment]): Deprecated: List of Environment objects. Use `environment`, instead. - """ - if agents is not None: - warnings.warn("WARNING: `agents` parameter is deprecated, use `agent` instead.", - category=DeprecationWarning) - agent = agents - if environments is not None: - warnings.warn("WARNING: `environments` parameter is deprecated, use `environments` instead.", - category=DeprecationWarning) - environment = environments - super(ThreadedRunner, self).__init__(agent, environment, repeat_actions) - - if len(agent) != len(environment): - raise TensorForceError("Each agent must have its own environment. Got {a} agents and {e} environments.". 
- format(a=len(self.agent), e=len(self.environment))) - self.save_path = save_path - self.save_episodes = save_episodes - if self.save_episodes is not None: - warnings.warn("WARNING: `save_episodes` parameter is deprecated, use `save_frequency` AND " - "`save_frequency_unit` instead.", - category=DeprecationWarning) - self.save_frequency = self.save_episodes - self.save_frequency_unit = "e" - else: - self.save_frequency = save_frequency - self.save_frequency_unit = save_frequency_unit - - # Initialize stats for parallel runs. - self.episode_list_lock = threading.Lock() - # Stop-condition flag that each worker abides to (aborts if True). - self.should_stop = False - # Global time counter (sec). - self.time = None - - def close(self): - self.agent[0].close() # only close first agent as we just have one shared model - for e in self.environment: - e.close() - - def run( - self, - num_episodes=-1, - max_episode_timesteps=-1, - episode_finished=None, - summary_report=None, - summary_interval=0, - num_timesteps=None, - deterministic=False, - episodes=None, - max_timesteps=None - ): - """ - Executes this runner by starting all Agents in parallel (each one in one thread). - - Args: - episodes (int): Deprecated; see num_episodes. - max_timesteps (int): Deprecated; see max_episode_timesteps. - """ - - # Renamed episodes into num_episodes to match BaseRunner's signature (fully backw. compatible). - if episodes is not None: - num_episodes = episodes - warnings.warn("WARNING: `episodes` parameter is deprecated, use `num_episodes` instead.", - category=DeprecationWarning) - assert isinstance(num_episodes, int) - # Renamed max_timesteps into max_episode_timesteps to match single Runner's signature (fully backw. compatible). - if max_timesteps is not None: - max_episode_timesteps = max_timesteps - warnings.warn("WARNING: `max_timesteps` parameter is deprecated, use `max_episode_timesteps` instead.", - category=DeprecationWarning) - assert isinstance(max_episode_timesteps, int) - - if summary_report is not None: - warnings.warn("WARNING: `summary_report` parameter is deprecated, use `episode_finished` callback " - "instead to generate summaries every n episodes.", - category=DeprecationWarning) - - self.reset() - - # Reset counts/stop-condition for this run. - self.global_episode = 0 - self.global_timestep = 0 - self.should_stop = False - - # Create threads. - threads = [threading.Thread(target=self._run_single, args=(t, self.agent[t], self.environment[t],), - kwargs={"deterministic": deterministic, - "max_episode_timesteps": max_episode_timesteps, - "episode_finished": episode_finished}) - for t in range(len(self.agent))] - - # Start threads. - self.start_time = time.time() - [t.start() for t in threads] - - # Stay idle until killed by SIGINT or a global stop condition is met. - try: - next_summary = 0 - next_save = 0 if self.save_frequency_unit != "s" else time.time() - while any([t.is_alive() for t in threads]) and self.global_episode < num_episodes or num_episodes == -1: - self.time = time.time() - - # This is deprecated (but still supported) and should be covered by the `episode_finished` callable. 
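In the new Runner, the role of the removed `episode_finished`/`summary_report` hooks is covered by the `callback` argument of `run()` described above: a callable taking the runner and the parallel index and returning whether execution should continue. A hedged sketch, with an arbitrary stopping threshold and reusing a `runner` constructed as in the earlier sketch:

```python
# Episode callback following the callback(runner, parallel) -> bool contract
# (illustrative sketch; the 195-return threshold is an arbitrary example).
import numpy as np


def stop_when_solved(runner, parallel):
    # Continue while the mean return over the last 100 episodes stays below 195.
    if len(runner.episode_returns) < 100:
        return True
    return float(np.mean(runner.episode_returns[-100:])) < 195.0


# Given a `runner` constructed as in the earlier sketch:
runner.run(num_episodes=1000, callback=stop_when_solved, callback_episode_frequency=1)
```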
- if summary_report is not None and self.global_episode > next_summary: - summary_report(self) - next_summary += summary_interval - - if self.save_path and self.save_frequency is not None: - do_save = True - current = None - if self.save_frequency_unit == "e" and self.global_episode > next_save: - current = self.global_episode - elif self.save_frequency_unit == "s" and self.time > next_save: - current = self.time - elif self.save_frequency_unit == "t" and self.global_timestep > next_save: - current = self.global_timestep - else: - do_save = False - - if do_save: - self.agent[0].save_model(self.save_path) - # Make sure next save is later than right now. - while next_save < current: - next_save += self.save_frequency - time.sleep(1) - - except KeyboardInterrupt: - print('Keyboard interrupt, sending stop command to threads') - - self.should_stop = True - - # Join threads. - [t.join() for t in threads] - print('All threads stopped') - - def _run_single(self, thread_id, agent, environment, deterministic=False, - max_episode_timesteps=-1, episode_finished=None): - """ - The target function for a thread, runs an agent and environment until signaled to stop. - Adds rewards to shared episode rewards list. - - Args: - thread_id (int): The ID of the thread that's running this target function. - agent (Agent): The Agent object that this particular thread uses. - environment (Environment): The Environment object that this particular thread uses. - max_episode_timesteps (int): Max. number of timesteps per episode. Use -1 or 0 for non-limited episodes. - episode_finished (callable): Function called after each episode that takes an episode summary spec and - returns False, if this single run should terminate after this episode. - Can be used e.g. to set a particular mean reward threshold. - """ - - # figure out whether we are using the deprecated way of "episode_finished" reporting - old_episode_finished = False - if episode_finished is not None and len(getargspec(episode_finished).args) == 1: - old_episode_finished = True - - episode = 0 - # Run this single worker (episode loop) as long as global count thresholds have not been reached. - while not self.should_stop: - state = environment.reset() - agent.reset() - self.global_timestep, self.global_episode = agent.timestep, agent.episode - episode_reward = 0 - - # Time step (within episode) loop - time_step = 0 - time_start = time.time() - while True: - action = agent.act(states=state, deterministic=deterministic) - reward = 0 - for repeat in xrange(self.repeat_actions): - state, terminal, step_reward = environment.execute(actions=action) - reward += step_reward - if terminal: - break - - agent.observe(reward=reward, terminal=terminal) - - time_step += 1 - episode_reward += reward - - if terminal or time_step == max_episode_timesteps: - break - - # Abort the episode (discard its results) when global says so. - if self.should_stop: - return - - self.global_timestep += time_step - - # Avoid race condition where order in episode_rewards won't match order in episode_timesteps. 
- self.episode_list_lock.acquire() - self.episode_rewards.append(episode_reward) - self.episode_timesteps.append(time_step) - self.episode_times.append(time.time() - time_start) - self.episode_list_lock.release() - - if episode_finished is not None: - # old way of calling episode_finished - if old_episode_finished: - summary_data = { - "thread_id": thread_id, - "episode": episode, - "timestep": time_step, - "episode_reward": episode_reward - } - if not episode_finished(summary_data): - return - # New way with BasicRunner (self) and thread-id. - elif not episode_finished(self, thread_id): - return - - episode += 1 - - # Backwards compatibility for deprecated properties (in case someone directly references these). - @property - def agents(self): - return self.agent - - @property - def environments(self): - return self.environment - - @property - def episode_lengths(self): - return self.episode_timesteps - - @property - def global_step(self): - return self.global_timestep - - -def WorkerAgentGenerator(agent_class): - """ - Worker Agent generator, receives an Agent class and creates a Worker Agent class that inherits from that Agent. - """ - - # Support special case where class is given as type-string (AgentsDictionary) or class-name-string. - if isinstance(agent_class, str): - agent_class = AgentsDictionary.get(agent_class) - # Last resort: Class name given as string? - if not agent_class and agent_class.find('.') != -1: - module_name, function_name = agent_class.rsplit('.', 1) - module = importlib.import_module(module_name) - agent_class = getattr(module, function_name) - - class WorkerAgent(agent_class): - """ - Worker agent receiving a shared model to avoid creating multiple models. - """ - - def __init__(self, model=None, **kwargs): - # Set our model externally. - self.model = model - # Be robust against `network` coming in from kwargs even though this agent doesn't have one - if not issubclass(agent_class, LearningAgent): - kwargs.pop("network") - # Call super c'tor (which will call initialize_model and assign self.model to the return value). - super(WorkerAgent, self).__init__(**kwargs) - - def initialize_model(self): - # Return our model (already given and initialized). - return self.model - - return WorkerAgent - - -def clone_worker_agent(agent, factor, environment, network, agent_config): - """ - Clones a given Agent (`factor` times) and returns a list of the cloned Agents with the original Agent - in the first slot. - - Args: - agent (Agent): The Agent object to clone. - factor (int): The length of the final list. - environment (Environment): The Environment to use for all cloned agents. - network (LayeredNetwork): The Network to use (or None) for an Agent's Model. - agent_config (dict): A dict of Agent specifications passed into the Agent's c'tor as kwargs. - Returns: - The list with `factor` cloned agents (including the original one). - """ - ret = [agent] - for i in xrange(factor - 1): - worker = WorkerAgentGenerator(type(agent))( - states=environment.states, - actions=environment.actions, - network=network, - model=agent.model, - **agent_config - ) - ret.append(worker) - - return ret diff --git a/tensorforce/meta_parameter_recorder.py b/tensorforce/meta_parameter_recorder.py deleted file mode 100755 index 9e6c48dbb..000000000 --- a/tensorforce/meta_parameter_recorder.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import inspect -import os -import numpy as np -import tensorflow as tf -from tensorforce import TensorForceError - - -class MetaParameterRecorder(object): - """ - Class to record MetaParameters as well as Summary/Description for TensorBoard (TEXT & FILE will come later) - - General: - - * format_type: used to configure data conversion for TensorBoard=0, TEXT & JSON (not Implemented), etc - """ - - def __init__(self, current_frame): - """ - Init the MetaPrameterRecord with "Agent" parameters by passing inspect.currentframe() from Agent Class - - The Init will search back to find the parent class to capture all passed parameters and store - them in "self.meta_params". - - NOTE: Currently only optimized for TensorBoard output - - TODO: Add JSON Export, TEXT EXPORT - - Args: - current_frame: frame value from class to obtain metaparameters[= inspect.currentframe()] - - """ - self.ignore_unknown_dtypes = False - self.meta_params = dict() - self.method_calling = inspect.getframeinfo(current_frame)[2] - - _, _, __, self.vals_current = inspect.getargvalues(current_frame) - # self is the class name of the frame involved - if 'self' in self.vals_current: - self.recorded_class_type = self.vals_current['self'] - # Add explicit AgentName item so class can be deleted - self.meta_params['AgentName'] = str(self.vals_current['self']) - - frame_list = inspect.getouterframes(current_frame) - - for frame in frame_list: - # Rather than frame.frame (named tuple), use [0] for python2 - args, varargs, keywords, vals =inspect.getargvalues(frame[0]) - if 'self' in vals: - if self.recorded_class_type == vals['self']: - for i in args: - self.meta_params[i] = vals[i] - # Remove the "CLASS" from the dictionary, has no value "AgentName" contains STR of Class - del self.meta_params['self'] - - def merge_custom(self, custom_dict): - if type(custom_dict) is not dict: - raise TensorForceError( - "Error: MetaParameterRecorder 'meta_dict' must be passed a dictionary " - "but was passed a type {} which is not supported.".format(str(type(custom_dict))) - ) - for key in custom_dict: - if key in self.meta_params: - raise TensorForceError( - "Error: MetaParameterRecorder 'meta_dict' key {} conflicts with internal key," - " please change passed key.".format(str(key)) - ) - self.meta_params[key] = custom_dict[key] - # This line assumes the merge data came from summary_spec['meta_dict'], remove this from summary_spec - del self.meta_params['summarizer']['meta_dict'] - - def text_output(self, format_type=1): - print('======================= ' + self.meta_params['AgentName'] + ' ====================================') - for key in self.meta_params: - print( - " ", - key, - type(self.meta_params[key]), - "=", - self.convert_data_to_string(self.meta_params[key], format_type=format_type) - ) - - print('======================= ' + self.meta_params['AgentName'] + ' ====================================') - - def 
convert_dictionary_to_string(self, data, indent=0, format_type=0, separator=None, eol=None): - data_string = "" - add_separator = "" - if eol is None: - eol = os.linesep - if separator is None: - separator = ", " - - # This should not ever occur but here as a catch - if type(data) is not dict: - raise TensorForceError( - "Error: MetaParameterRecorder Dictionary conversion was passed a type {}" - " not supported.".format(str(type(data))) - ) - - # TensorBoard - if format_type == 0: - label = "" - div = "" - - if indent > 0: - label = " | " - div = "--- | " - data_string += label + "Key | Value" + eol + div + "--- | ----" + eol - - for key in data: - key_txt = key - # TensorBoard - if format_type == 0: - key_txt = "**" + key + "**" - key_value_sep = ' | ' - if indent > 0: - key_txt = " | " + key_txt - - converted_data = self.convert_data_to_string(data[key], separator=separator, indent=indent+1) - data_string += add_separator + key_txt + key_value_sep + converted_data + eol - - return data_string - - def convert_list_to_string(self, data, indent=0, format_type=0, eol=None, count=True): - data_string = "" - if eol is None: - eol = os.linesep - - # This should not ever occur but here as a catch - if type(data) is not list: - raise TensorForceError( - "Error: MetaParameterRecorder List conversion was passed a type {}" - " not supported.".format(str(type(data))) - ) - - for index,line in enumerate(data): - data_string_prefix = "" - if count and indent == 0: - data_string_prefix = str(index+1)+". " - # TensorBoard - if format_type == 0: - # Only add indent for 2nd item and beyond as this is likely a dictionary entry - if indent > 0 and index>0: - data_string_prefix = " | "+data_string_prefix - if index == (len(data)-1): - append_eol = "" - else: - append_eol = eol - data_string += data_string_prefix + self.convert_data_to_string(line, indent=indent+1) + append_eol - - return data_string - - def convert_ndarray_to_md(self, data, format_type=0, eol=None): - data_string = "" - data_string1 = "|Row|" - data_string2 = "|:---:|" - if eol is None: - eol = os.linesep - - # This should not ever occur but here as a catch - if type(data) is not np.ndarray: - raise TensorForceError( - "Error: MetaParameterRecorder ndarray conversion was passed" - " a type {} not supported.".format(str(type(data))) - ) - - shape = data.shape - rank = data.ndim - - if rank == 2: - for col in range(shape[1]): - data_string1 += "Col-" + str(col) + "|" - data_string2 += ":----:|" - data_string += data_string1 + eol + data_string2 + eol - - for row in range(shape[0]): - data_string += "|" + str(row) + "|" - for col in range(shape[1]): - data_string += str(data[row,col]) + "|" - - if row != (shape[0]-1): - data_string += eol - - elif rank == 1: - data_string += "|Row|Col-0|" + eol + "|:----:|:----:|" + eol - - for row in range(shape[0]): - data_string += str(row) + "|" + str(data[row]) + "|" + eol - - return data_string - - def convert_data_to_string(self, data, indent=0, format_type=0, separator=None, eol=None): - data_string = "" - if type(data) is int: - data_string = str(data) - elif type(data) is float: - data_string = str(data) - elif type(data) is str: - data_string = data - elif type(data) is tuple: - data_string = str(data) - elif type(data) is list: - data_string = self.convert_list_to_string(data, indent=indent, eol=eol) - elif type(data) is bool: - data_string = str(data) - elif type(data) is dict: - data_string = self.convert_dictionary_to_string(data, indent=indent, separator=separator) - elif type(data) is 
np.ndarray: - # TensorBoard - if format_type == 0: - data_string = self.convert_ndarray_to_md(data) - else: - data_string = str(data) - elif data is None: - data_string = "None" - else: - if not self.ignore_unknown_dtypes: - data_string = "Error: MetaParameterRecorder Type conversion from type {} not supported.".\ - format(str(type(data))) - data_string += " ("+str(data)+") " - else: - # TensorBoard - if format_type == 0: - data_string = "**?**" - - return data_string - - def build_metagraph_list(self): - """ - Convert MetaParams into TF Summary Format and create summary_op - - Args: - None - - Returns: - Merged TF Op for TEXT summary elements, should only be executed once to reduce data duplication - - """ - ops = [] - - self.ignore_unknown_dtypes = True - for key in sorted(self.meta_params): - value = self.convert_data_to_string(self.meta_params[key]) - - if len(value) == 0: - continue - if isinstance(value, str): - ops.append(tf.summary.text(name=key, tensor=tf.convert_to_tensor(str(value)))) - else: - ops.append(tf.summary.text(name=key, tensor=tf.as_string(tf.convert_to_tensor(value)))) - - with tf.control_dependencies(tf.tuple(ops)): - self.summary_merged = tf.summary.merge_all() - - return self.summary_merged diff --git a/tensorforce/models/__init__.py b/tensorforce/models/__init__.py deleted file mode 100755 index a474c622e..000000000 --- a/tensorforce/models/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from tensorforce.models.model import Model -from tensorforce.models.memory_model import MemoryModel -from tensorforce.models.distribution_model import DistributionModel -from tensorforce.models.pg_model import PGModel -from tensorforce.models.pg_log_prob_model import PGLogProbModel -from tensorforce.models.dpg_target_model import DPGTargetModel -from tensorforce.models.pg_prob_ratio_model import PGProbRatioModel -from tensorforce.models.q_model import QModel -from tensorforce.models.q_nstep_model import QNstepModel -from tensorforce.models.q_naf_model import QNAFModel -from tensorforce.models.q_demo_model import QDemoModel - - -models = dict( - pg_log_prob_model=PGLogProbModel, - pg_log_prob_target_model=DPGTargetModel, - pg_prob_ratio_model=PGProbRatioModel, - q_model=QModel, - q_nstep_model=QNstepModel, - q_naf_model=QNAFModel, - q_demo_model=QDemoModel -) - - -__all__ = [ - 'Model', - 'MemoryModel', - 'DistributionModel', - 'PGModel', - 'PGProbRatioModel', - 'DPGTargetModel', - 'PGLogProbModel', - 'QModel', - 'QNstepModel', - 'QNAFModel', - 'QDemoModel', - 'models' -] diff --git a/tensorforce/models/constant_model.py b/tensorforce/models/constant_model.py deleted file mode 100644 index aa1cb8de4..000000000 --- a/tensorforce/models/constant_model.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce.models import Model - - -class ConstantModel(Model): - """ - Utility class to return constant actions of a desired shape and with given bounds. - """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - action_values - ): - self.action_values = action_values - - super(ConstantModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None - ) - - def tf_actions_and_internals(self, states, internals, deterministic): - assert len(internals) == 0 - - actions = dict() - for name, action in self.actions_spec.items(): - shape = (tf.shape(input=next(iter(states.values())))[0],) + action['shape'] - actions[name] = tf.fill(dims=shape, value=self.action_values[name]) - - return actions, dict() - - def tf_observe_timestep(self, states, internals, actions, terminal, reward): - return tf.no_op() diff --git a/tensorforce/models/distribution_model.py b/tensorforce/models/distribution_model.py deleted file mode 100755 index 84392494b..000000000 --- a/tensorforce/models/distribution_model.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce import util -from tensorforce.core.networks import Network -from tensorforce.core.distributions import Distribution, Bernoulli, Categorical, Gaussian, Beta -from tensorforce.models import MemoryModel - - -class DistributionModel(MemoryModel): - """ - Base class for models using distributions parametrized by a neural network. 
- """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing, - update_mode, - memory, - optimizer, - discount, - network, - distributions, - entropy_regularization, - requires_deterministic - ): - self.network_spec = network - self.distributions_spec = distributions - - # Entropy regularization - assert entropy_regularization is None or entropy_regularization >= 0.0 - self.entropy_regularization = entropy_regularization - - # For deterministic action sampling (Q vs PG model) - self.requires_deterministic = requires_deterministic - - self.network = None - self.distributions = None - self.fn_kl_divergence = None - - super(DistributionModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount - ) - - def initialize(self, custom_getter): - # Network - self.network = Network.from_spec( - spec=self.network_spec, - kwargs=dict(summary_labels=self.summary_labels) - ) - - # Before super-call since internals_spec attribute is required subsequently. - assert len(self.internals_spec) == 0 - self.internals_spec = self.network.internals_spec() - - super(DistributionModel, self).initialize(custom_getter) - - # Distributions - self.distributions = self.create_distributions() - - # KL divergence function - self.fn_kl_divergence = tf.make_template( - name_='kl-divergence', - func_=self.tf_kl_divergence, - custom_getter_=custom_getter - ) - - def create_distributions(self): - distributions = dict() - for name, action in self.actions_spec.items(): - - if self.distributions_spec is not None and name in self.distributions_spec: - kwargs = dict(action) - kwargs['summary_labels'] = self.summary_labels - distributions[name] = Distribution.from_spec( - spec=self.distributions_spec[name], - kwargs=kwargs - ) - - elif action['type'] == 'bool': - distributions[name] = Bernoulli( - shape=action['shape'], - summary_labels=self.summary_labels - ) - - elif action['type'] == 'int': - distributions[name] = Categorical( - shape=action['shape'], - num_actions=action['num_actions'], - summary_labels=self.summary_labels - ) - - elif action['type'] == 'float': - if 'min_value' in action: - distributions[name] = Beta( - shape=action['shape'], - min_value=action['min_value'], - max_value=action['max_value'], - summary_labels=self.summary_labels - ) - - else: - distributions[name] = Gaussian( - shape=action['shape'], - summary_labels=self.summary_labels - ) - - return distributions - - def tf_actions_and_internals(self, states, internals, deterministic): - embedding, internals = self.network.apply( - x=states, - internals=internals, - update=tf.constant(value=False), - return_internals=True - ) - - actions = dict() - for name, distribution in self.distributions.items(): - distr_params = distribution.parameterize(x=embedding) - actions[name] = distribution.sample( - distr_params=distr_params, - deterministic=tf.logical_or(x=deterministic, y=self.requires_deterministic) - ) - # Prefix named variable with "name_" if more than 1 distribution - if len(self.distributions.items()) > 1: - name_prefix 
= name + "_" - else: - name_prefix = "" - # parameterize() returns list as [logits, probabilities, state_value] - self.network.set_named_tensor(name_prefix + "logits", distr_params[0]) - self.network.set_named_tensor(name_prefix + "probabilities", distr_params[1]) - self.network.set_named_tensor(name_prefix + "state_value", distr_params[2]) - - return actions, internals - - def tf_regularization_losses(self, states, internals, update): - losses = super(DistributionModel, self).tf_regularization_losses( - states=states, - internals=internals, - update=update - ) - - network_loss = self.network.regularization_loss() - if network_loss is not None: - losses['network'] = network_loss - - for distribution in self.distributions.values(): - regularization_loss = distribution.regularization_loss() - if regularization_loss is not None: - if 'distributions' in losses: - losses['distributions'] += regularization_loss - else: - losses['distributions'] = regularization_loss - - if self.entropy_regularization is not None and self.entropy_regularization > 0.0: - entropies = list() - embedding = self.network.apply(x=states, internals=internals, update=update) - for name, distribution in self.distributions.items(): - distr_params = distribution.parameterize(x=embedding) - entropy = distribution.entropy(distr_params=distr_params) - collapsed_size = util.prod(util.shape(entropy)[1:]) - entropy = tf.reshape(tensor=entropy, shape=(-1, collapsed_size)) - entropies.append(entropy) - - entropy_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=entropies, axis=1), axis=1) - entropy = tf.reduce_mean(input_tensor=entropy_per_instance, axis=0) - if 'entropy' in self.summary_labels: - summary = tf.summary.scalar(name='entropy', tensor=entropy) - self.summaries.append(summary) - losses['entropy'] = -self.entropy_regularization * entropy - - return losses - - def tf_kl_divergence(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - embedding = self.network.apply(x=states, internals=internals, update=update) - kl_divergences = list() - - for name, distribution in self.distributions.items(): - distr_params = distribution.parameterize(x=embedding) - fixed_distr_params = tuple(tf.stop_gradient(input=value) for value in distr_params) - kl_divergence = distribution.kl_divergence(distr_params1=fixed_distr_params, distr_params2=distr_params) - collapsed_size = util.prod(util.shape(kl_divergence)[1:]) - kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, collapsed_size)) - kl_divergences.append(kl_divergence) - - kl_divergence_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=kl_divergences, axis=1), axis=1) - return tf.reduce_mean(input_tensor=kl_divergence_per_instance, axis=0) - - def optimizer_arguments(self, states, internals, actions, terminal, reward, next_states, next_internals): - arguments = super(DistributionModel, self).optimizer_arguments( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals - ) - arguments['fn_kl_divergence'] = self.fn_kl_divergence - return arguments - - def get_variables(self, include_submodules=False, include_nontrainable=False): - model_variables = super(DistributionModel, self).get_variables( - include_submodules=include_submodules, - include_nontrainable=include_nontrainable - ) - - network_variables = self.network.get_variables(include_nontrainable=include_nontrainable) - model_variables += network_variables - 
- distribution_variables = [ - variable for name in sorted(self.distributions) - for variable in self.distributions[name].get_variables(include_nontrainable=include_nontrainable) - ] - model_variables += distribution_variables - - return model_variables - - def get_summaries(self): - model_summaries = super(DistributionModel, self).get_summaries() - network_summaries = self.network.get_summaries() - distribution_summaries = [ - summary for name in sorted(self.distributions) - for summary in self.distributions[name].get_summaries() - ] - - return model_summaries + network_summaries + distribution_summaries diff --git a/tensorforce/models/dpg_target_model.py b/tensorforce/models/dpg_target_model.py deleted file mode 100755 index b2a8ba84e..000000000 --- a/tensorforce/models/dpg_target_model.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce import util, TensorForceError -from tensorforce.models import DistributionModel - -from tensorforce.core.networks import Network, LayerBasedNetwork, Dense, Linear, TFLayer, Nonlinearity -from tensorforce.core.optimizers import Optimizer, Synchronization - - -class DDPGCriticNetwork(LayerBasedNetwork): - def __init__(self, scope='ddpg-critic-network', summary_labels=(), size_t0=400, size_t1=300): - super(DDPGCriticNetwork, self).__init__(scope=scope, summary_labels=summary_labels) - - self.t0l = Linear(size=size_t0, scope='linear0') - self.t0b = TFLayer(layer='batch_normalization', scope='batchnorm0', center=True, scale=True) - self.t0n = Nonlinearity(name='relu', scope='relu0') - - self.t1l = Linear(size=size_t1, scope='linear1') - self.t1b = TFLayer(layer='batch_normalization', scope='batchnorm1', center=True, scale=True) - self.t1n = Nonlinearity(name='relu', scope='relu1') - - self.t2d = Dense(size=1, activation='tanh', scope='dense0', - weights=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) - - self.add_layer(self.t0l) - self.add_layer(self.t0b) - self.add_layer(self.t0n) - - self.add_layer(self.t1l) - self.add_layer(self.t1b) - self.add_layer(self.t1n) - - self.add_layer(self.t2d) - - def tf_apply(self, x, internals, update, return_internals=False): - assert x['states'], x['actions'] - - if isinstance(x['states'], dict): - if len(x['states']) != 1: - raise TensorForceError('DDPG critic network must have only one state input, but {} given.'.format( - len(x['states']))) - x_states = next(iter(x['states'].values())) - else: - x_states = x['states'] - - if isinstance(x['actions'], dict): - if len(x['actions']) != 1: - raise TensorForceError('DDPG critic network must have only one action input, but {} given.'.format( - len(x['actions']))) - x_actions = next(iter(x['actions'].values())) - else: - x_actions = x['actions'] - - 
x_actions = tf.reshape(tf.cast(x_actions, dtype=tf.float32), (-1, 1)) - - out = self.t0l.apply(x=x_states, update=update) - out = self.t0b.apply(x=out, update=update) - out = self.t0n.apply(x=out, update=update) - - out = self.t1l.apply(x=tf.concat([out, x_actions], axis=-1), update=update) - out = self.t1b.apply(x=out, update=update) - out = self.t1n.apply(x=out, update=update) - - out = self.t2d.apply(x=out, update=update) - - # Remove last dimension because we only return Q values for one state and action - out = tf.squeeze(out) - - if return_internals: - # Todo: Internals management - return out, None - else: - return out - - -class DPGTargetModel(DistributionModel): - """ - Policy gradient model log likelihood model with target network (e.g. DDPG) - """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing, - update_mode, - memory, - optimizer, - discount, - network, - distributions, - entropy_regularization, - critic_network, - critic_optimizer, - target_sync_frequency, - target_update_weight - ): - - self.critic_network_spec = critic_network - self.critic_optimizer_spec = critic_optimizer - - self.target_sync_frequency = target_sync_frequency - self.target_update_weight = target_update_weight - - # self.network is the actor, self.critic is the critic - self.target_network = None - self.target_network_optimizer = None - - self.critic = None - self.critic_optimizer = None - self.target_critic = None - self.target_critic_optimizer = None - - super(DPGTargetModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization, - requires_deterministic=True - ) - - assert self.memory_spec["include_next_states"] - assert self.requires_deterministic == True - - def initialize(self, custom_getter): - super(DPGTargetModel, self).initialize(custom_getter) - - # Target network - self.target_network = Network.from_spec( - spec=self.network_spec, - kwargs=dict(scope='target-network', summary_labels=self.summary_labels) - ) - - # Target network optimizer - self.target_network_optimizer = Synchronization( - sync_frequency=self.target_sync_frequency, - update_weight=self.target_update_weight - ) - - # Target network distributions - self.target_distributions = self.create_distributions() - - # Critic - size_t0 = self.critic_network_spec['size_t0'] - size_t1 = self.critic_network_spec['size_t1'] - - self.critic = DDPGCriticNetwork(scope='critic', size_t0=size_t0, size_t1=size_t1) - self.critic_optimizer = Optimizer.from_spec( - spec=self.critic_optimizer_spec, - kwargs=dict(summary_labels=self.summary_labels) - ) - - self.target_critic = DDPGCriticNetwork(scope='target-critic', size_t0=size_t0, size_t1=size_t1) - - # Target critic optimizer - self.target_critic_optimizer = Synchronization( - sync_frequency=self.target_sync_frequency, - update_weight=self.target_update_weight - ) - - self.fn_target_actions_and_internals = tf.make_template( - 
name_='target-actions-and-internals', - func_=self.tf_target_actions_and_internals, - custom_getter_=custom_getter - ) - - self.fn_predict_target_q = tf.make_template( - name_='predict-target-q', - func_=self.tf_predict_target_q, - custom_getter_=custom_getter - ) - - def tf_target_actions_and_internals(self, states, internals, deterministic=True): - embedding, internals = self.target_network.apply( - x=states, - internals=internals, - update=tf.constant(value=False), - return_internals=True - ) - - actions = dict() - for name, distribution in self.target_distributions.items(): - distr_params = distribution.parameterize(x=embedding) - actions[name] = distribution.sample( - distr_params=distr_params, - deterministic=tf.logical_or(x=deterministic, y=self.requires_deterministic) - ) - - return actions, internals - - def tf_loss_per_instance(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - q = self.critic.apply(dict(states=states, actions=actions), internals=internals, update=update) - return -q - - def tf_predict_target_q(self, states, internals, terminal, actions, reward, update): - q_value = self.target_critic.apply(dict(states=states, actions=actions), internals=internals, update=update) - return reward + (1. - tf.cast(terminal, dtype=tf.float32)) * self.discount * q_value - - def tf_optimization(self, states, internals, actions, terminal, reward, next_states=None, next_internals=None): - update = tf.constant(value=True) - - # Predict actions from target actor - next_target_actions, next_target_internals = self.fn_target_actions_and_internals( - states=next_states, internals=next_internals, deterministic=True - ) - - # Predicted Q value of next states - predicted_q = self.fn_predict_target_q( - states=next_states, internals=next_internals, actions=next_target_actions, terminal=terminal, - reward=reward, update=update - ) - - predicted_q = tf.stop_gradient(input=predicted_q) - - real_q = self.critic.apply(dict(states=states, actions=actions), internals=internals, update=update) - - # Update critic - def fn_critic_loss(predicted_q, real_q): - return tf.reduce_mean(tf.square(real_q - predicted_q)) - - critic_optimization = self.critic_optimizer.minimize( - time=self.timestep, - variables=self.critic.get_variables(), - arguments=dict( - predicted_q=predicted_q, - real_q=real_q - ), - fn_loss=fn_critic_loss) - - # Update actor - predicted_actions, predicted_internals = self.fn_actions_and_internals( - states=states, internals=internals, deterministic=True - ) - - optimization = super(DPGTargetModel, self).tf_optimization( - states=states, - internals=internals, - actions=predicted_actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals - ) - - # Update target actor (network) and critic - network_distributions_variables = [ - variable for name in sorted(self.distributions) - for variable in self.distributions[name].get_variables(include_nontrainable=False) - ] - - target_distributions_variables = [ - variable for name in sorted(self.target_distributions) - for variable in self.target_distributions[name].get_variables(include_nontrainable=False) - ] - - target_optimization = self.target_network_optimizer.minimize( - time=self.timestep, - variables=self.target_network.get_variables() + target_distributions_variables, - source_variables=self.network.get_variables() + network_distributions_variables - ) - - target_critic_optimization = self.target_critic_optimizer.minimize( - 
time=self.timestep, - variables=self.target_critic.get_variables(), - source_variables=self.critic.get_variables() - ) - - return tf.group(critic_optimization, optimization, target_optimization, target_critic_optimization) - - def get_variables(self, include_submodules=False, include_nontrainable=False): - model_variables = super(DPGTargetModel, self).get_variables( - include_submodules=include_submodules, - include_nontrainable=include_nontrainable - ) - critic_variables = self.critic.get_variables(include_nontrainable=include_nontrainable) - model_variables += critic_variables - - if include_nontrainable: - critic_optimizer_variables = self.critic_optimizer.get_variables() - - for variable in critic_optimizer_variables: - if variable in model_variables: - model_variables.remove(variable) - - model_variables += critic_optimizer_variables - - if include_submodules: - target_variables = self.target_network.get_variables(include_nontrainable=include_nontrainable) - model_variables += target_variables - - target_distributions_variables = [ - variable for name in sorted(self.target_distributions) - for variable in self.target_distributions[name].get_variables(include_nontrainable=include_nontrainable) - ] - model_variables += target_distributions_variables - - target_critic_variables = self.target_critic.get_variables(include_nontrainable=include_nontrainable) - model_variables += target_critic_variables - - if include_nontrainable: - target_optimizer_variables = self.target_network_optimizer.get_variables() - model_variables += target_optimizer_variables - - target_critic_optimizer_variables = self.target_critic_optimizer.get_variables() - model_variables += target_critic_optimizer_variables - - return model_variables - - def get_summaries(self): - target_network_summaries = self.target_network.get_summaries() - target_distributions_summaries = [ - summary for name in sorted(self.target_distributions) - for summary in self.target_distributions[name].get_summaries() - ] - - # Todo: Critic summaries - return super(DPGTargetModel, self).get_summaries() + target_network_summaries \ - + target_distributions_summaries diff --git a/tensorforce/models/memory_model.py b/tensorforce/models/memory_model.py deleted file mode 100755 index 6a51e8aea..000000000 --- a/tensorforce/models/memory_model.py +++ /dev/null @@ -1,625 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce import util, TensorForceError -from tensorforce.core.memories import Memory -from tensorforce.core.optimizers import Optimizer -from tensorforce.models import Model - - -class MemoryModel(Model): - """ - A memory model is a generical model to accumulate and sample data. 
- """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing, - update_mode, - memory, - optimizer, - discount - ): - """ - Memory model. - - Args: - states (spec): The state-space description dictionary. - actions (spec): The action-space description dictionary. - scope (str): The root scope str to use for tf variable scoping. - device (str): The name of the device to run the graph of this model on. - saver (spec): Dict specifying whether and how to save the model's parameters. - summarizer (spec): Dict specifying which tensorboard summaries should be created and added to the graph. - distributed (spec): Dict specifying whether and how to do distributed training on the model's graph. - batching_capacity (int): Batching capacity. - variable_noise (float): The stddev value of a Normal distribution used for adding random - noise to the model's output (for each batch, noise can be toggled and - if active - will be resampled). - Use None for not adding any noise. - states_preprocessing (spec / dict of specs): Dict specifying whether and how to preprocess state signals - (e.g. normalization, greyscale, etc..). - actions_exploration (spec / dict of specs): Dict specifying whether and how to add exploration to the model's - "action outputs" (e.g. epsilon-greedy). - reward_preprocessing (spec): Dict specifying whether and how to preprocess rewards coming - from the Environment (e.g. reward normalization). - update_mode (spec): Update mode. - memory (spec): Memory. - optimizer (spec): Dict specifying the tf optimizer to use for tuning the model's trainable parameters. - discount (float): The RL reward discount factor (gamma). 
- """ - self.update_mode = update_mode - self.memory_spec = memory - self.optimizer_spec = optimizer - - # Discount - assert discount is None or discount >= 0.0 - self.discount = discount - - self.memory = None - self.optimizer = None - self.fn_discounted_cumulative_reward = None - self.fn_loss_per_instance = None - self.fn_regularization_losses = None - self.fn_loss = None - self.fn_optimization = None - - super(MemoryModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing - ) - - def as_local_model(self): - super(MemoryModel, self).as_local_model() - self.optimizer_spec = dict( - type='global_optimizer', - optimizer=self.optimizer_spec - ) - - def initialize(self, custom_getter): - super(MemoryModel, self).initialize(custom_getter) - - # Memory - self.memory = Memory.from_spec( - spec=self.memory_spec, - kwargs=dict( - states=self.states_spec, - internals=self.internals_spec, - actions=self.actions_spec, - summary_labels=self.summary_labels - ) - ) - - # Optimizer - self.optimizer = Optimizer.from_spec( - spec=self.optimizer_spec, - kwargs=dict(summary_labels=self.summary_labels) - ) - - # TensorFlow functions - self.fn_discounted_cumulative_reward = tf.make_template( - name_='discounted-cumulative-reward', - func_=self.tf_discounted_cumulative_reward, - custom_getter_=custom_getter - ) - self.fn_reference = tf.make_template( - name_='reference', - func_=self.tf_reference, - custom_getter_=custom_getter - ) - self.fn_loss_per_instance = tf.make_template( - name_='loss-per-instance', - func_=self.tf_loss_per_instance, - custom_getter_=custom_getter - ) - self.fn_regularization_losses = tf.make_template( - name_='regularization-losses', - func_=self.tf_regularization_losses, - custom_getter_=custom_getter - ) - self.fn_loss = tf.make_template( - name_='loss', - func_=self.tf_loss, - custom_getter_=custom_getter - ) - self.fn_optimization = tf.make_template( - name_='optimization', - func_=self.tf_optimization, - custom_getter_=custom_getter - ) - self.fn_import_experience = tf.make_template( - name_='import-experience', - func_=self.tf_import_experience, - custom_getter_=custom_getter - ) - - def tf_initialize(self): - super(MemoryModel, self).tf_initialize() - self.memory.initialize() - - def tf_discounted_cumulative_reward(self, terminal, reward, discount, final_reward=0.0): - """ - Creates the TensorFlow operations for calculating the discounted cumulative rewards - for a given sequence of rewards. - - Args: - terminal: Terminal boolean tensor. - reward: Reward tensor. - discount: Discount factor. - final_reward: Last reward value in the sequence. - - Returns: - Discounted cumulative reward tensor. 
- """ - - # TODO: n-step cumulative reward (particularly for envs without terminal) - - def cumulate(cumulative, reward_and_terminal): - rew, term = reward_and_terminal - return tf.where(condition=term, x=rew, y=(rew + cumulative * discount)) - - # Reverse since reward cumulation is calculated right-to-left, but tf.scan only works left-to-right - reward = tf.reverse(tensor=reward, axis=(0,)) - terminal = tf.reverse(tensor=terminal, axis=(0,)) - - reward = tf.scan(fn=cumulate, elems=(reward, terminal), initializer=tf.stop_gradient(input=final_reward)) - - return tf.reverse(tensor=reward, axis=(0,)) - - # # TODO: this could be a utility helper function if we remove self.discount and only allow external discount-value input - # def tf_discounted_cumulative_reward(self, terminal, reward, discount=None, final_reward=0.0, horizon=0): - # """ - # Creates and returns the TensorFlow operations for calculating the sequence of discounted cumulative rewards - # for a given sequence of single rewards. - - # Example: - # single rewards = 2.0 1.0 0.0 0.5 1.0 -1.0 - # terminal = False, False, False, False True False - # gamma = 0.95 - # final_reward = 100.0 (only matters for last episode (r=-1.0) as this episode has no terminal signal) - # horizon=3 - # output = 2.95 1.45 1.38 1.45 1.0 94.0 - - # Args: - # terminal: Tensor (bool) holding the is-terminal sequence. This sequence may contain more than one - # True value. If its very last element is False (not terminating), the given `final_reward` value - # is assumed to follow the last value in the single rewards sequence (see below). - # reward: Tensor (float) holding the sequence of single rewards. If the last element of `terminal` is False, - # an assumed last reward of the value of `final_reward` will be used. - # discount (float): The discount factor (gamma). By default, take the Model's discount factor. - # final_reward (float): Reward value to use if last episode in sequence does not terminate (terminal sequence - # ends with False). This value will be ignored if horizon == 1 or discount == 0.0. - # horizon (int): The length of the horizon (e.g. for n-step cumulative rewards in continuous tasks - # without terminal signals). Use 0 (default) for an infinite horizon. Note that horizon=1 leads to the - # exact same results as a discount factor of 0.0. - - # Returns: - # Discounted cumulative reward tensor with the same shape as `reward`. - # """ - - # # By default -> take Model's gamma value - # if discount is None: - # discount = self.discount - - # # Accumulates discounted (n-step) reward (start new if terminal) - # def cumulate(cumulative, reward_terminal_horizon_subtract): - # rew, is_terminal, is_over_horizon, sub = reward_terminal_horizon_subtract - # return tf.where( - # # If terminal, start new cumulation. - # condition=is_terminal, - # x=rew, - # y=tf.where( - # # If we are above the horizon length (H) -> subtract discounted value from H steps back. - # condition=is_over_horizon, - # x=(rew + cumulative * discount - sub), - # y=(rew + cumulative * discount) - # ) - # ) - - # # Accumulates length of episodes (starts new if terminal) - # def len_(cumulative, term): - # return tf.where( - # condition=term, - # # Start counting from 1 after is-terminal signal - # x=tf.ones(shape=(), dtype=tf.int32), - # # Otherwise, increase length by 1 - # y=cumulative + 1 - # ) - - # # Reverse, since reward cumulation is calculated right-to-left, but tf.scan only works left-to-right. - # reward = tf.reverse(tensor=reward, axis=(0,)) - # # e.g. 
-1.0 1.0 0.5 0.0 1.0 2.0 - # terminal = tf.reverse(tensor=terminal, axis=(0,)) - # # e.g. F T F F F F - - # # Store the steps until end of the episode(s) determined by the input terminal signals (True starts new count). - # lengths = tf.scan(fn=len_, elems=terminal, initializer=0) - # # e.g. 1 1 2 3 4 5 - # off_horizon = tf.greater(lengths, tf.fill(dims=tf.shape(lengths), value=horizon)) - # # e.g. F F F F T T - - # # Calculate the horizon-subtraction value for each step. - # if horizon > 0: - # horizon_subtractions = tf.map_fn(lambda x: (discount ** horizon) * x, reward, dtype=tf.float32) - # # Shift right by size of horizon (fill rest with 0.0). - # horizon_subtractions = tf.concat([np.zeros(shape=(horizon,)), horizon_subtractions], axis=0) - # horizon_subtractions = tf.slice(horizon_subtractions, begin=(0,), size=tf.shape(reward)) - # # e.g. 0.0, 0.0, 0.0, -1.0*g^3, 1.0*g^3, 0.5*g^3 - # # all 0.0 if infinite horizon (special case: horizon=0) - # else: - # horizon_subtractions = tf.zeros(shape=tf.shape(reward)) - - # # Now do the scan, each time summing up the previous step (discounted by gamma) and - # # subtracting the respective `horizon_subtraction`. - # reward = tf.scan( - # fn=cumulate, - # elems=(reward, terminal, off_horizon, horizon_subtractions), - # initializer=final_reward if horizon != 1 else 0.0 - # ) - # # Re-reverse again to match input sequences. - # return tf.reverse(tensor=reward, axis=(0,)) - - def tf_reference(self, states, internals, actions, terminal, reward, next_states, next_internals, update): - """ - Creates the TensorFlow operations for obtaining the reference tensor(s), in case of a - comparative loss. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - actions: Dict of action tensors. - terminal: Terminal boolean tensor. - reward: Reward tensor. - next_states: Dict of successor state tensors. - next_internals: List of posterior internal state tensors. - update: Boolean tensor indicating whether this call happens during an update. - - Returns: - Reference tensor(s). - """ - return None - - def tf_loss_per_instance(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - """ - Creates the TensorFlow operations for calculating the loss per batch instance. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - actions: Dict of action tensors. - terminal: Terminal boolean tensor. - reward: Reward tensor. - next_states: Dict of successor state tensors. - next_internals: List of posterior internal state tensors. - update: Boolean tensor indicating whether this call happens during an update. - reference: Optional reference tensor(s), in case of a comparative loss. - - Returns: - Loss per instance tensor. - """ - raise NotImplementedError - - def tf_regularization_losses(self, states, internals, update): - """ - Creates the TensorFlow operations for calculating the regularization losses for the given input states. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - update: Boolean tensor indicating whether this call happens during an update. - - Returns: - Dict of regularization loss tensors. - """ - return dict() - - def tf_loss(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - """ - Creates the TensorFlow operations for calculating the full loss of a batch. - - Args: - states: Dict of state tensors. 
- internals: List of prior internal state tensors. - actions: Dict of action tensors. - terminal: Terminal boolean tensor. - reward: Reward tensor. - next_states: Dict of successor state tensors. - next_internals: List of posterior internal state tensors. - update: Boolean tensor indicating whether this call happens during an update. - reference: Optional reference tensor(s), in case of a comparative loss. - - Returns: - Loss tensor. - """ - # Mean loss per instance - loss_per_instance = self.fn_loss_per_instance( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals, - update=update, - reference=reference - ) - - self.memory.update_batch(loss_per_instance=loss_per_instance) - - loss = tf.reduce_mean(input_tensor=loss_per_instance, axis=0) - - # Loss without regularization summary - if 'losses' in self.summary_labels: - summary = tf.summary.scalar(name='loss-without-regularization', tensor=loss) - self.summaries.append(summary) - - # Regularization losses - losses = self.fn_regularization_losses(states=states, internals=internals, update=update) - if len(losses) > 0: - loss += tf.add_n(inputs=list(losses.values())) - if 'regularization' in self.summary_labels: - for name, loss_val in losses.items(): - summary = tf.summary.scalar(name=('regularization/' + name), tensor=loss_val) - self.summaries.append(summary) - - # Total loss summary - if 'losses' in self.summary_labels or 'total-loss' in self.summary_labels: - summary = tf.summary.scalar(name='total-loss', tensor=loss) - self.summaries.append(summary) - - return loss - - def optimizer_arguments(self, states, internals, actions, terminal, reward, next_states, next_internals): - """ - Returns the optimizer arguments including the time, the list of variables to optimize, - and various functions which the optimizer might require to perform an update step. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - actions: Dict of action tensors. - terminal: Terminal boolean tensor. - reward: Reward tensor. - next_states: Dict of successor state tensors. - next_internals: List of posterior internal state tensors. - - Returns: - Optimizer arguments as dict. - """ - arguments = dict( - time=self.global_timestep, - variables=self.get_variables(), - arguments=dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals, - update=tf.constant(value=True) - ), - fn_reference=self.fn_reference, - fn_loss=self.fn_loss - ) - if self.global_model is not None: - arguments['global_variables'] = self.global_model.get_variables() - return arguments - - def tf_optimization(self, states, internals, actions, terminal, reward, next_states=None, next_internals=None): - """ - Creates the TensorFlow operations for performing an optimization update step based - on the given input states and actions batch. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - actions: Dict of action tensors. - terminal: Terminal boolean tensor. - reward: Reward tensor. - next_states: Dict of successor state tensors. - next_internals: List of posterior internal state tensors. - - Returns: - The optimization operation. 
- """ - arguments = self.optimizer_arguments( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals - ) - return self.optimizer.minimize(**arguments) - - def tf_observe_timestep(self, states, internals, actions, terminal, reward): - # Store timestep in memory - stored = self.memory.store( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - # Periodic optimization - with tf.control_dependencies(control_inputs=(stored,)): - unit = self.update_mode['unit'] - batch_size = self.update_mode['batch_size'] - frequency = self.update_mode.get('frequency', batch_size) - first_update = self.update_mode.get('first_update', 0) - - if unit == 'timesteps': - # Timestep-based batch - optimize = tf.logical_and( - x=tf.equal(x=(self.timestep % frequency), y=0), - y=tf.logical_and( - x=tf.greater_equal(x=self.timestep, y=batch_size), - y=tf.greater_equal(x=self.timestep, y=first_update) - ) - ) - batch = self.memory.retrieve_timesteps(n=batch_size) - - elif unit == 'episodes': - # Episode-based batch - optimize = tf.logical_and( - x=tf.equal(x=(self.episode % frequency), y=0), - y=tf.logical_and( - # Only update once per episode increment. - x=tf.greater(x=tf.count_nonzero(input_tensor=terminal), y=0), - y=tf.logical_and( - x=tf.greater_equal(x=self.episode, y=batch_size), - y=tf.greater_equal(x=self.episode, y=first_update) - ) - ) - ) - batch = self.memory.retrieve_episodes(n=batch_size) - - elif unit == 'sequences': - # Timestep-sequence-based batch - sequence_length = self.update_mode.get('length', 8) - optimize = tf.logical_and( - x=tf.equal(x=(self.timestep % frequency), y=0), - y=tf.logical_and( - x=tf.greater_equal(x=self.timestep, y=(batch_size + sequence_length - 1)), - y=tf.greater_equal(x=self.timestep, y=first_update) - ) - ) - batch = self.memory.retrieve_sequences(n=batch_size, sequence_length=sequence_length) - - else: - raise TensorForceError("Invalid update unit: {}.".format(unit)) - - # Do not calculate gradients for memory-internal operations. - batch = util.map_tensors( - fn=(lambda tensor: tf.stop_gradient(input=tensor)), - tensors=batch - ) - - optimization = tf.cond( - pred=optimize, - true_fn=(lambda: self.fn_optimization(**batch)), - false_fn=tf.no_op - ) - - return optimization - - def tf_import_experience(self, states, internals, actions, terminal, reward): - """ - Imports experiences into the TensorFlow memory structure. Can be used to import - off-policy data. - - :param states: Dict of state values to import with keys as state names and values as values to set. - :param internals: Internal values to set, can be fetched from agent via agent.current_internals - if no values available. - :param actions: Dict of action values to import with keys as action names and values as values to set. - :param terminal: Terminal value(s) - :param reward: Reward value(s) - """ - return self.memory.store( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - def create_operations(self, states, internals, actions, terminal, reward, deterministic, independent): - # Import experience operation. 
- self.import_experience_output = self.fn_import_experience( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - super(MemoryModel, self).create_operations( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - deterministic=deterministic, - independent=independent - ) - - def get_variables(self, include_submodules=False, include_nontrainable=False): - model_variables = super(MemoryModel, self).get_variables( - include_submodules=include_submodules, - include_nontrainable=include_nontrainable - ) - - if include_nontrainable: - memory_variables = self.memory.get_variables() - model_variables += memory_variables - - optimizer_variables = self.optimizer.get_variables() - # For some reason, some optimizer variables are only registered in the model. - for variable in optimizer_variables: - if variable in model_variables: - model_variables.remove(variable) - model_variables += optimizer_variables - - return model_variables - - def get_summaries(self): - model_summaries = super(MemoryModel, self).get_summaries() - memory_summaries = self.memory.get_summaries() - optimizer_summaries = self.optimizer.get_summaries() - return model_summaries + memory_summaries + optimizer_summaries - - def import_experience(self, states, internals, actions, terminal, reward): - """ - Stores experiences. - """ - fetches = self.import_experience_output - - feed_dict = self.get_feed_dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - self.monitored_session.run(fetches=fetches, feed_dict=feed_dict) diff --git a/tensorforce/models/model.py b/tensorforce/models/model.py deleted file mode 100755 index 5fa9240fa..000000000 --- a/tensorforce/models/model.py +++ /dev/null @@ -1,1297 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - - -""" -The `Model` class coordinates the creation and execution of all TensorFlow operations within a model. -It implements the `reset`, `act` and `update` functions, which form the interface the `Agent` class -communicates with, and which should not need to be overwritten. Instead, the following TensorFlow -functions need to be implemented: - -* `tf_actions_and_internals(states, internals, deterministic)` returning the batch of - actions and successor internal states. -* `tf_loss_per_instance(states, internals, actions, terminal, reward)` returning the loss - per instance for a batch. - -Further, the following TensorFlow functions should be extended accordingly: - -* `initialize(custom_getter)` defining TensorFlow placeholders/functions and adding internal states. -* `get_variables()` returning the list of TensorFlow variables (to be optimized) of this model. -* `tf_regularization_losses(states, internals)` returning a dict of regularization losses. 
-* `get_optimizer_kwargs(states, internals, actions, terminal, reward)` returning a dict of potential - arguments (argument-free functions) to the optimizer. - -Finally, the following TensorFlow functions can be useful in some cases: - -* `preprocess_states(states)` for state preprocessing, returning the processed batch of states. -* `tf_action_exploration(action, exploration, action_spec)` for action postprocessing (e.g. exploration), - returning the processed batch of actions. -* `tf_preprocess_reward(states, internals, terminal, reward)` for reward preprocessing (e.g. reward normalization), - returning the processed batch of rewards. -* `create_output_operations(states, internals, actions, terminal, reward, deterministic)` for further output operations, - similar to the two above for `Model.act` and `Model.update`. -* `tf_optimization(states, internals, actions, terminal, reward)` for further optimization operations - (e.g. the baseline update in a `PGModel` or the target network update in a `QModel`), - returning a single grouped optimization operation. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from copy import deepcopy -import os - -import numpy as np -import tensorflow as tf - -from tensorforce import TensorForceError, util -from tensorforce.core.explorations import Exploration -from tensorforce.core.preprocessors import PreprocessorStack - - -class Model(object): - """ - Base class for all (TensorFlow-based) models. - """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing - ): - """ - Model. - - Args: - states (spec): The state-space description dictionary. - actions (spec): The action-space description dictionary. - scope (str): The root scope str to use for tf variable scoping. - device (str): The name of the device to run the graph of this model on. - saver (spec): Dict specifying whether and how to save the model's parameters. - summarizer (spec): Dict specifying which tensorboard summaries should be created and added to the graph. - distributed (spec): Dict specifying whether and how to do distributed training on the model's graph. - batching_capacity (int): Batching capacity. - variable_noise (float): The stddev value of a Normal distribution used for adding random - noise to the model's output (for each batch, noise can be toggled and - if active - will be resampled). - Use None for not adding any noise. - states_preprocessing (spec / dict of specs): Dict specifying whether and how to preprocess state signals - (e.g. normalization, greyscale, etc..). - actions_exploration (spec / dict of specs): Dict specifying whether and how to add exploration to the model's - "action outputs" (e.g. epsilon-greedy). - reward_preprocessing (spec): Dict specifying whether and how to preprocess rewards coming - from the Environment (e.g. reward normalization). 
- """ - # Network crated from network_spec in distribution_model.py - # Needed for named_tensor access - self.network = None - - # States/internals/actions specifications - self.states_spec = states - self.internals_spec = dict() - self.actions_spec = actions - - # TensorFlow scope, device - self.scope = scope - self.device = device - - # Saver/summaries/distributes - if saver is None or saver.get('directory') is None: - self.saver_spec = None - else: - self.saver_spec = saver - if summarizer is None or summarizer.get('directory') is None: - self.summarizer_spec = None - else: - self.summarizer_spec = summarizer - - self.distributed_spec = distributed - - # TensorFlow summaries - if self.summarizer_spec is None: - self.summary_labels = set() - else: - self.summary_labels = set(self.summarizer_spec.get('labels', ())) - - # Batching capacity for act/observe interface - assert batching_capacity is None or (isinstance(batching_capacity, int) and batching_capacity > 0) - self.batching_capacity = batching_capacity - - # Variable noise - assert variable_noise is None or variable_noise > 0.0 - self.variable_noise = variable_noise - - # Preprocessing and exploration - self.states_preprocessing_spec = states_preprocessing - self.actions_exploration_spec = actions_exploration - self.reward_preprocessing_spec = reward_preprocessing - - self.is_observe = False - - self.states_preprocessing = None - self.actions_exploration = None - self.reward_preprocessing = None - - self.variables = None - self.all_variables = None - self.registered_variables = None - self.summaries = None - - self.timestep = None - self.episode = None - self.global_timestep = None - self.global_episode = None - - self.states_input = None - self.internals_input = None - self.actions_input = None - self.terminal_input = None - self.reward_input = None - self.deterministic_input = None - self.independent_input = None - self.update_input = None - self.internals_init = None - - self.fn_initialize = None - self.fn_actions_and_internals = None - self.fn_observe_timestep = None - self.fn_action_exploration = None - - self.graph = None - self.global_model = None - self.scaffold = None - self.saver_directory = None - self.session = None - self.monitored_session = None - self.summary_writer = None - self.summary_writer_hook = None - - self.increment_episode = None - - self.actions_output = None - self.internals_output = None - self.timestep_output = None - - self.summary_configuration_op = None - - # Setup TensorFlow graph and session - self.setup() - - def setup(self): - """ - Sets up the TensorFlow model graph and initializes (and enters) the TensorFlow session. - """ - - # Create our Graph or figure out, which shared/global one to use. - default_graph = None - # No parallel RL or ThreadedRunner with Hogwild! shared network updates: - # Build single graph and work with that from here on. In the case of threaded RL, the central - # and already initialized model is handed to the worker Agents via the ThreadedRunner's - # WorkerAgentGenerator factory. - if self.distributed_spec is None: - self.graph = tf.Graph() - default_graph = self.graph.as_default() - default_graph.__enter__() - self.global_model = None - # Distributed tensorflow setup (each process gets its own (identical) graph). - # We are the parameter server. 
- elif self.distributed_spec.get('parameter_server'): - if self.distributed_spec.get('replica_model'): - raise TensorForceError("Invalid config value for distributed mode.") - self.graph = tf.Graph() - default_graph = self.graph.as_default() - default_graph.__enter__() - self.global_model = None - self.scope = self.scope + '-ps' - # We are a worker's replica model. - # Place our ops round-robin on all worker devices. - elif self.distributed_spec.get('replica_model'): - self.graph = tf.get_default_graph() - self.global_model = None - # The graph is the parent model's graph, hence no new graph here. - self.device = tf.train.replica_device_setter( - worker_device=self.device, - cluster=self.distributed_spec['cluster_spec'] - ) - self.scope = self.scope + '-ps' - # We are a worker: - # Construct the global model (deepcopy of ourselves), set it up via `setup` and link to it (global_model). - else: - graph = tf.Graph() - default_graph = graph.as_default() - default_graph.__enter__() - self.global_model = deepcopy(self) - self.global_model.distributed_spec['replica_model'] = True - self.global_model.setup() - self.graph = graph - self.as_local_model() - self.scope = self.scope + '-worker' + str(self.distributed_spec['task_index']) - - with tf.device(device_name_or_function=self.device): - with tf.variable_scope(name_or_scope=self.scope, reuse=False): - - # Variables and summaries - self.variables = dict() - self.all_variables = dict() - self.registered_variables = set() - self.summaries = list() - - def custom_getter(getter, name, registered=False, **kwargs): - if registered: - self.registered_variables.add(name) - elif name in self.registered_variables: - registered = True - # Top-level, hence no 'registered' argument. - variable = getter(name=name, **kwargs) - if not registered: - self.all_variables[name] = variable - if kwargs.get('trainable', True): - self.variables[name] = variable - if 'variables' in self.summary_labels: - summary = tf.summary.histogram(name=name, values=variable) - self.summaries.append(summary) - return variable - - # Global timestep - collection = self.graph.get_collection(name='global-timestep') - if len(collection) == 0: - self.global_timestep = tf.Variable( - name='global-timestep', - dtype=util.tf_dtype('int'), - trainable=False, - initial_value=0 - ) - self.graph.add_to_collection(name='global-timestep', value=self.global_timestep) - self.graph.add_to_collection(name=tf.GraphKeys.GLOBAL_STEP, value=self.global_timestep) - else: - assert len(collection) == 1 - self.global_timestep = collection[0] - - # Global episode - collection = self.graph.get_collection(name='global-episode') - if len(collection) == 0: - self.global_episode = tf.Variable( - name='global-episode', - dtype=util.tf_dtype('int'), - trainable=False, - initial_value=0 - ) - self.graph.add_to_collection(name='global-episode', value=self.global_episode) - else: - assert len(collection) == 1 - self.global_episode = collection[0] - - # Create placeholders, tf functions, internals, etc - self.initialize(custom_getter=custom_getter) - - # self.fn_actions_and_internals( - # states=states, - # internals=internals, - # update=update, - # deterministic=deterministic - # ) - # self.fn_loss_per_instance( - # states=states, - # internals=internals, - # actions=actions, - # terminal=terminal, - # reward=reward, - # update=update - # ) - self.fn_initialize() - - # Input tensors - states = util.map_tensors(fn=tf.identity, tensors=self.states_input) - internals = util.map_tensors(fn=tf.identity, 
tensors=self.internals_input) - actions = util.map_tensors(fn=tf.identity, tensors=self.actions_input) - terminal = tf.identity(input=self.terminal_input) - reward = tf.identity(input=self.reward_input) - # Probably both deterministic and independent should be the same at some point. - deterministic = tf.identity(input=self.deterministic_input) - independent = tf.identity(input=self.independent_input) - - states, actions, reward = self.fn_preprocess(states=states, actions=actions, reward=reward) - - self.create_operations( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - deterministic=deterministic, - independent=independent - ) - - # Add all summaries specified in summary_labels - if any(k in self.summary_labels for k in ['inputs', 'states']): - for name, state in states.items(): - summary = tf.summary.histogram(name=(self.scope + '/inputs/states/' + name), values=state) - self.summaries.append(summary) - if any(k in self.summary_labels for k in ['inputs', 'actions']): - for name, action in actions.items(): - summary = tf.summary.histogram(name=(self.scope + '/inputs/actions/' + name), values=action) - self.summaries.append(summary) - if any(k in self.summary_labels for k in ['inputs', 'rewards']): - summary = tf.summary.histogram(name=(self.scope + '/inputs/rewards'), values=reward) - self.summaries.append(summary) - - if self.distributed_spec is None: - global_variables = self.get_variables(include_submodules=True, include_nontrainable=True) - global_variables += [self.global_episode, self.global_timestep] - init_op = tf.variables_initializer(var_list=global_variables) - ready_op = tf.report_uninitialized_variables(var_list=global_variables) - ready_for_local_init_op = None - local_init_op = None - - else: - # We are just a replica model: Return. - if self.distributed_spec.get('replica_model'): - return - # We are the parameter server: Start and wait. - elif self.distributed_spec.get('parameter_server'): - server = tf.train.Server( - server_or_cluster_def=self.distributed_spec['cluster_spec'], - job_name='ps', - task_index=self.distributed_spec['task_index'], - protocol=self.distributed_spec.get('protocol'), - config=None, - start=True - ) - # Param server does nothing actively. - server.join() - return - - # Global and local variable initializers. - global_variables = self.global_model.get_variables( - include_submodules=True, - include_nontrainable=True - ) - global_variables += [self.global_episode, self.global_timestep] - local_variables = self.get_variables(include_submodules=True, include_nontrainable=True) - init_op = tf.variables_initializer(var_list=global_variables) - ready_op = tf.report_uninitialized_variables(var_list=(global_variables + local_variables)) - ready_for_local_init_op = tf.report_uninitialized_variables(var_list=global_variables) - local_init_op = tf.group( - tf.variables_initializer(var_list=local_variables), - # Synchronize values of trainable variables. - *(tf.assign(ref=local_var, value=global_var) for local_var, global_var in zip( - self.get_variables(include_submodules=True), - self.global_model.get_variables(include_submodules=True) - )) - ) - - def init_fn(scaffold, session): - if self.saver_spec is not None and self.saver_spec.get('load', True): - directory = self.saver_spec['directory'] - file = self.saver_spec.get('file') - if file is None: - file = tf.train.latest_checkpoint( - checkpoint_dir=directory, - latest_filename=None # Corresponds to argument of saver.save() in Model.save(). 
- ) - elif not os.path.isfile(file): - file = os.path.join(directory, file) - if file is not None: - scaffold.saver.restore(sess=session, save_path=file) - - # Summary operation - summaries = self.get_summaries() - if len(summaries) > 0: - summary_op = tf.summary.merge(inputs=summaries) - else: - summary_op = None - - # TensorFlow saver object - self.saver = tf.train.Saver( - var_list=global_variables, # should be given? - reshape=False, - sharded=False, # should be true? - max_to_keep=5, - keep_checkpoint_every_n_hours=10000.0, - name=None, - restore_sequentially=False, - saver_def=None, - builder=None, - defer_build=False, - allow_empty=True, - write_version=tf.train.SaverDef.V2, - pad_step_number=False, - save_relative_paths=True - # filename=None - ) - - # TensorFlow scaffold object - self.scaffold = tf.train.Scaffold( - init_op=init_op, - init_feed_dict=None, - init_fn=init_fn, - ready_op=ready_op, - ready_for_local_init_op=ready_for_local_init_op, - local_init_op=local_init_op, - summary_op=summary_op, - saver=self.saver, - copy_from_scaffold=None - ) - - hooks = list() - - # Checkpoint saver hook - if self.saver_spec is not None and (self.distributed_spec is None or self.distributed_spec['task_index'] == 0): - self.saver_directory = self.saver_spec['directory'] - hooks.append(tf.train.CheckpointSaverHook( - checkpoint_dir=self.saver_directory, - save_secs=self.saver_spec.get('seconds', None if 'steps' in self.saver_spec else 600), - save_steps=self.saver_spec.get('steps'), # Either one or the other has to be set. - saver=None, # None since given via 'scaffold' argument. - checkpoint_basename=self.saver_spec.get('basename', 'model.ckpt'), - scaffold=self.scaffold, - listeners=None - )) - else: - self.saver_directory = None - - # Summary saver hook - if self.summarizer_spec is None: - self.summarizer_hook = None - else: - # TensorFlow summary writer object - self.summarizer = tf.summary.FileWriter( - logdir=self.summarizer_spec['directory'], - graph=self.graph, - max_queue=10, - flush_secs=120, - filename_suffix=None - ) - self.summarizer_hook = util.UpdateSummarySaverHook( - model=self, - save_steps=self.summarizer_spec.get('steps'), # Either one or the other has to be set. - save_secs=self.summarizer_spec.get('seconds', None if 'steps' in self.summarizer_spec else 120), - output_dir=None, # None since given via 'summary_writer' argument. - summary_writer=self.summarizer, - scaffold=self.scaffold, - summary_op=None # None since given via 'scaffold' argument. - ) - hooks.append(self.summarizer_hook) - - # Stop at step hook - # hooks.append(tf.train.StopAtStepHook( - # num_steps=???, # This makes more sense, if load and continue training. - # last_step=None # Either one or the other has to be set. - # )) - - # # Step counter hook - # hooks.append(tf.train.StepCounterHook( - # every_n_steps=counter_config.get('steps', 100), # Either one or the other has to be set. - # every_n_secs=counter_config.get('secs'), # Either one or the other has to be set. - # output_dir=None, # None since given via 'summary_writer' argument. 
- # summary_writer=summary_writer - # )) - - # Other available hooks: - # tf.train.FinalOpsHook(final_ops, final_ops_feed_dict=None) - # tf.train.GlobalStepWaiterHook(wait_until_step) - # tf.train.LoggingTensorHook(tensors, every_n_iter=None, every_n_secs=None) - # tf.train.NanTensorHook(loss_tensor, fail_on_nan_loss=True) - # tf.train.ProfilerHook(save_steps=None, save_secs=None, output_dir='', show_dataflow=True, show_memory=False) - - if self.distributed_spec is None: - # TensorFlow non-distributed monitored session object - self.monitored_session = tf.train.SingularMonitoredSession( - hooks=hooks, - scaffold=self.scaffold, - master='', # Default value. - config=None, # self.distributed_spec.get('session_config'), - checkpoint_dir=None - ) - - else: - server = tf.train.Server( - server_or_cluster_def=self.distributed_spec['cluster_spec'], - job_name='worker', - task_index=self.distributed_spec['task_index'], - protocol=self.distributed_spec.get('protocol'), - config=self.distributed_spec.get('session_config'), - start=True - ) - - # if self.distributed_spec['task_index'] == 0: - # TensorFlow chief session creator object - session_creator = tf.train.ChiefSessionCreator( - scaffold=self.scaffold, - master=server.target, - config=self.distributed_spec.get('session_config'), - checkpoint_dir=None, - checkpoint_filename_with_path=None - ) - # else: - # # TensorFlow worker session creator object - # session_creator = tf.train.WorkerSessionCreator( - # scaffold=self.scaffold, - # master=server.target, - # config=self.distributed_spec.get('session_config'), - # ) - - # TensorFlow monitored session object - self.monitored_session = tf.train.MonitoredSession( - session_creator=session_creator, - hooks=hooks, - stop_grace_period_secs=120 # Default value. - ) - - if default_graph: - default_graph.__exit__(None, None, None) - self.graph.finalize() - self.monitored_session.__enter__() - self.session = self.monitored_session._tf_sess() - - def close(self): - if self.saver_directory is not None: - self.save(append_timestep=True) - self.monitored_session.close() - - def as_local_model(self): - pass - - def initialize(self, custom_getter): - """ - Creates the TensorFlow placeholders and functions for this model. Moreover adds the - internal state placeholders and initialization values to the model. - - Args: - custom_getter: The `custom_getter_` object to use for `tf.make_template` when creating TensorFlow functions. 
- """ - - # States - self.states_input = dict() - for name, state in self.states_spec.items(): - self.states_input[name] = tf.placeholder( - dtype=util.tf_dtype(state['type']), - shape=(None,) + tuple(state['shape']), - name=('state-' + name) - ) - - # States preprocessing - self.states_preprocessing = dict() - - if self.states_preprocessing_spec is None: - for name, state in self.states_spec.items(): - state['unprocessed_shape'] = state['shape'] - elif not isinstance(self.states_preprocessing_spec, list) and \ - all(name in self.states_spec for name in self.states_preprocessing_spec): - for name, state in self.states_spec.items(): - if name in self.states_preprocessing_spec: - preprocessing = PreprocessorStack.from_spec( - spec=self.states_preprocessing_spec[name], - kwargs=dict(shape=state['shape']) - ) - state['unprocessed_shape'] = state['shape'] - state['shape'] = preprocessing.processed_shape(shape=state['unprocessed_shape']) - self.states_preprocessing[name] = preprocessing - else: - state['unprocessed_shape'] = state['shape'] - # single preprocessor for all components of our state space - elif "type" in self.states_preprocessing_spec: - preprocessing = PreprocessorStack.from_spec(spec=self.states_preprocessing_spec) - for name, state in self.states_spec.items(): - state['unprocessed_shape'] = state['shape'] - state['shape'] = preprocessing.processed_shape(shape=state['unprocessed_shape']) - self.states_preprocessing[name] = preprocessing - else: - for name, state in self.states_spec.items(): - preprocessing = PreprocessorStack.from_spec( - spec=self.states_preprocessing_spec, - kwargs=dict(shape=state['shape']) - ) - state['unprocessed_shape'] = state['shape'] - state['shape'] = preprocessing.processed_shape(shape=state['unprocessed_shape']) - self.states_preprocessing[name] = preprocessing - - # Internals - self.internals_input = dict() - self.internals_init = dict() - for name, internal in self.internals_spec.items(): - self.internals_input[name] = tf.placeholder( - dtype=util.tf_dtype(internal['type']), - shape=(None,) + tuple(internal['shape']), - name=('internal-' + name) - ) - if internal['initialization'] == 'zeros': - self.internals_init[name] = np.zeros(shape=internal['shape']) - else: - raise TensorForceError("Invalid internal initialization value.") - - # Actions - self.actions_input = dict() - for name, action in self.actions_spec.items(): - self.actions_input[name] = tf.placeholder( - dtype=util.tf_dtype(action['type']), - shape=(None,) + tuple(action['shape']), - name=('action-' + name) - ) - - # Actions exploration - self.actions_exploration = dict() - if self.actions_exploration_spec is None: - pass - elif all(name in self.actions_spec for name in self.actions_exploration_spec): - for name, action in self.actions_spec.items(): - if name in self.actions_exploration: - self.actions_exploration[name] = Exploration.from_spec(spec=self.actions_exploration_spec[name]) - else: - for name, action in self.actions_spec.items(): - self.actions_exploration[name] = Exploration.from_spec(spec=self.actions_exploration_spec) - - # Terminal - self.terminal_input = tf.placeholder(dtype=util.tf_dtype('bool'), shape=(None,), name='terminal') - - # Reward - self.reward_input = tf.placeholder(dtype=util.tf_dtype('float'), shape=(None,), name='reward') - - # Reward preprocessing - if self.reward_preprocessing_spec is None: - self.reward_preprocessing = None - else: - self.reward_preprocessing = PreprocessorStack.from_spec( - spec=self.reward_preprocessing_spec, - # TODO this can 
eventually have more complex shapes? - kwargs=dict(shape=()) - ) - if self.reward_preprocessing.processed_shape(shape=()) != (): - raise TensorForceError("Invalid reward preprocessing!") - - # Deterministic/independent action flag (should probably be the same) - self.deterministic_input = tf.placeholder(dtype=util.tf_dtype('bool'), shape=(), name='deterministic') - self.independent_input = tf.placeholder(dtype=util.tf_dtype('bool'), shape=(), name='independent') - - # TensorFlow functions - self.fn_initialize = tf.make_template( - name_='initialize', - func_=self.tf_initialize, - custom_getter_=custom_getter - ) - self.fn_preprocess = tf.make_template( - name_='preprocess', - func_=self.tf_preprocess, - custom_getter_=custom_getter - ) - self.fn_actions_and_internals = tf.make_template( - name_='actions-and-internals', - func_=self.tf_actions_and_internals, - custom_getter_=custom_getter - ) - self.fn_observe_timestep = tf.make_template( - name_='observe-timestep', - func_=self.tf_observe_timestep, - custom_getter_=custom_getter - ) - self.fn_action_exploration = tf.make_template( - name_='action-exploration', - func_=self.tf_action_exploration, - custom_getter_=custom_getter - ) - - self.summary_configuration_op = None - if self.summarizer_spec and 'meta_param_recorder_class' in self.summarizer_spec: - self.summary_configuration_op = self.summarizer_spec['meta_param_recorder_class'].build_metagraph_list() - - # self.fn_summarization = tf.make_template( - # name_='summarization', - # func_=self.tf_summarization, - # custom_getter_=custom_getter - # ) - - def tf_initialize(self): - # Timestep - self.timestep = tf.get_variable( - name='timestep', - dtype=util.tf_dtype('int'), - initializer=0, - trainable=False - ) - - # Episode - self.episode = tf.get_variable( - name='episode', - dtype=util.tf_dtype('int'), - initializer=0, - trainable=False - ) - - if self.batching_capacity is None: - capacity = 1 - else: - capacity = self.batching_capacity - - # States buffer variable - self.states_buffer = dict() - for name, state in self.states_spec.items(): - self.states_buffer[name] = tf.get_variable( - name=('state-' + name), - shape=((capacity,) + tuple(state['shape'])), - dtype=util.tf_dtype(state['type']), - trainable=False - ) - - # Internals buffer variable - self.internals_buffer = dict() - for name, internal in self.internals_spec.items(): - self.internals_buffer[name] = tf.get_variable( - name=('internal-' + name), - shape=((capacity,) + tuple(internal['shape'])), - dtype=util.tf_dtype(internal['type']), - trainable=False - ) - - # Actions buffer variable - self.actions_buffer = dict() - for name, action in self.actions_spec.items(): - self.actions_buffer[name] = tf.get_variable( - name=('action-' + name), - shape=((capacity,) + tuple(action['shape'])), - dtype=util.tf_dtype(action['type']), - trainable=False - ) - - # Buffer index - self.buffer_index = tf.get_variable( - name='buffer-index', - shape=(), - dtype=util.tf_dtype('int'), - trainable=False - ) - - def tf_preprocess(self, states, actions, reward): - # States preprocessing - for name, preprocessing in self.states_preprocessing.items(): - states[name] = preprocessing.process(tensor=states[name]) - - # Reward preprocessing - if self.reward_preprocessing is not None: - reward = self.reward_preprocessing.process(tensor=reward) - - return states, actions, reward - - def tf_action_exploration(self, action, exploration, action_spec): - """ - Applies optional exploration to the action (post-processor for action outputs). 
- - Args: - action (tf.Tensor): The original output action tensor (to be post-processed). - exploration (Exploration): The Exploration object to use. - action_spec (dict): Dict specifying the action space. - Returns: - The post-processed action output tensor. - """ - action_shape = tf.shape(input=action) - exploration_value = exploration.tf_explore( - episode=self.global_episode, - timestep=self.global_timestep, - action_spec=action_spec - ) - - if action_spec['type'] == 'bool': - action = tf.where( - condition=(tf.random_uniform(shape=action_shape[0]) < exploration_value), - x=(tf.random_uniform(shape=action_shape) < 0.5), - y=action - ) - - elif action_spec['type'] == 'int': - action = tf.where( - condition=(tf.random_uniform(shape=action_shape) < exploration_value), - x=tf.random_uniform(shape=action_shape, maxval=action_spec['num_actions'], dtype=util.tf_dtype('int')), - y=action - ) - - elif action_spec['type'] == 'float': - for _ in range(util.rank(action) - 1): - exploration_value = tf.expand_dims(input=exploration_value, axis=-1) - action += exploration_value - if 'min_value' in action_spec: - action = tf.clip_by_value( - t=action, - clip_value_min=action_spec['min_value'], - clip_value_max=action_spec['max_value'] - ) - - return action - - def tf_actions_and_internals(self, states, internals, deterministic): - """ - Creates and returns the TensorFlow operations for retrieving the actions and - if applicable - - the posterior internal state Tensors in reaction to the given input states (and prior internal states). - - Args: - states (dict): Dict of state tensors (each key represents one state space component). - internals: List of prior internal state tensors. - deterministic: Boolean tensor indicating whether action should be chosen - deterministically. - - Returns: - tuple: - 1) dict of output actions (with or without exploration applied (see `deterministic`)) - 2) list of posterior internal state Tensors (empty for non-internal state models) - """ - raise NotImplementedError - - def tf_observe_timestep(self, states, internals, actions, terminal, reward): - """ - Creates the TensorFlow operations for performing the observation of a full time step's - information. - - Args: - states (dict): Dict of state tensors (each key represents one state space component). - internals: List of prior internal state tensors. - actions: Dict of action tensors. - terminal: Terminal boolean tensor. - reward: Reward tensor. - - Returns: - The observation operation. 
- """ - raise NotImplementedError - - def create_act_operations(self, states, internals, deterministic, independent): - # Optional variable noise - operations = list() - if self.variable_noise is not None and self.variable_noise > 0.0: - # Initialize variables - self.fn_actions_and_internals( - states=states, - internals=internals, - deterministic=deterministic - ) - - noise_deltas = list() - for variable in self.get_variables(): - noise_delta = tf.random_normal(shape=util.shape(variable), mean=0.0, stddev=self.variable_noise) - noise_deltas.append(noise_delta) - operations.append(variable.assign_add(delta=noise_delta)) - - # Retrieve actions and internals - with tf.control_dependencies(control_inputs=operations): - self.actions_output, self.internals_output = self.fn_actions_and_internals( - states=states, - internals=internals, - deterministic=deterministic - ) - - # Subtract variable noise - with tf.control_dependencies(control_inputs=list(self.actions_output.values())): - operations = list() - if self.variable_noise is not None and self.variable_noise > 0.0: - for variable, noise_delta in zip(self.get_variables(), noise_deltas): - operations.append(variable.assign_sub(delta=noise_delta)) - - # Actions exploration - with tf.control_dependencies(control_inputs=operations): - for name, exploration in self.actions_exploration.items(): - self.actions_output[name] = tf.cond( - pred=self.deterministic_input, - true_fn=(lambda: self.actions_output[name]), - false_fn=(lambda: self.fn_action_exploration( - action=self.actions_output[name], - exploration=exploration, - action_spec=self.actions_spec[name] - )) - ) - - # Independent act not followed by observe. - def independent_act(): - return self.global_timestep - - # Normal act followed by observe, with additional operations. 
- def normal_act(): - # Store current states, internals and actions - operations = list() - batch_size = tf.shape(input=next(iter(states.values())))[0] - for name, state in states.items(): - operations.append(tf.assign( - ref=self.states_buffer[name][self.buffer_index: self.buffer_index + batch_size], - value=state - )) - for name, internal in internals.items(): - operations.append(tf.assign( - ref=self.internals_buffer[name][self.buffer_index: self.buffer_index + batch_size], - value=internal - )) - for name, action in self.actions_output.items(): - operations.append(tf.assign( - ref=self.actions_buffer[name][self.buffer_index: self.buffer_index + batch_size], - value=action - )) - - with tf.control_dependencies(control_inputs=operations): - operations = list() - - operations.append(tf.assign_add(ref=self.buffer_index, value=batch_size)) - - # Increment timestep - operations.append(tf.assign_add(ref=self.timestep, value=batch_size)) - operations.append(tf.assign_add(ref=self.global_timestep, value=batch_size)) - - with tf.control_dependencies(control_inputs=operations): - # Trivial operation to enforce control dependency - return self.global_timestep + 0 - - # Only increment timestep and update buffer if act not independent - self.timestep_output = tf.cond(pred=independent, true_fn=independent_act, false_fn=normal_act) - - def create_observe_operations(self, terminal, reward): - # Increment episode - num_episodes = tf.count_nonzero(input_tensor=terminal, dtype=util.tf_dtype('int')) - increment_episode = tf.assign_add(ref=self.episode, value=num_episodes) - increment_global_episode = tf.assign_add(ref=self.global_episode, value=num_episodes) - - with tf.control_dependencies(control_inputs=(increment_episode, increment_global_episode)): - # Stop gradients - fn = (lambda x: tf.stop_gradient(input=x[:self.buffer_index])) - states = util.map_tensors(fn=fn, tensors=self.states_buffer) - internals = util.map_tensors(fn=fn, tensors=self.internals_buffer) - actions = util.map_tensors(fn=fn, tensors=self.actions_buffer) - terminal = tf.stop_gradient(input=terminal) - reward = tf.stop_gradient(input=reward) - - # Observation - observation = self.fn_observe_timestep( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - with tf.control_dependencies(control_inputs=(observation,)): - # Reset index - reset_index = tf.assign(ref=self.buffer_index, value=0) - - with tf.control_dependencies(control_inputs=(reset_index,)): - # Trivial operation to enforce control dependency - self.episode_output = self.global_episode + 0 - - # TODO: add up rewards per episode and add summary_label 'episode-reward' - - def create_operations(self, states, internals, actions, terminal, reward, deterministic, independent): - """ - Creates output operations for acting, observing and interacting with the memory. - """ - self.create_act_operations( - states=states, - internals=internals, - deterministic=deterministic, - independent=independent - ) - self.create_observe_operations(reward=reward, terminal=terminal) - - def get_variables(self, include_submodules=False, include_nontrainable=False): - """ - Returns the TensorFlow variables used by the model. - - Args: - include_submodules: Includes variables of submodules (e.g. baseline, target network) - if true. - include_nontrainable: Includes non-trainable variables if true. - - Returns: - List of variables. 
- """ - if include_nontrainable: - model_variables = [self.all_variables[key] for key in sorted(self.all_variables)] - - states_preprocessing_variables = [ - variable for preprocessing in self.states_preprocessing.values() - for variable in preprocessing.get_variables() - ] - model_variables += states_preprocessing_variables - - actions_exploration_variables = [ - variable for exploration in self.actions_exploration.values() - for variable in exploration.get_variables() - ] - model_variables += actions_exploration_variables - - if self.reward_preprocessing is not None: - reward_preprocessing_variables = self.reward_preprocessing.get_variables() - model_variables += reward_preprocessing_variables - - else: - model_variables = [self.variables[key] for key in sorted(self.variables)] - - return model_variables - - def get_summaries(self): - """ - Returns the TensorFlow summaries reported by the model - - Returns: - List of summaries - """ - return self.summaries - - def reset(self): - """ - Resets the model to its initial state on episode start. This should also reset all preprocessor(s). - - Returns: - tuple: - Current episode, timestep counter and the shallow-copied list of internal state initialization Tensors. - """ - - fetches = [self.global_episode, self.global_timestep] - - # Loop through all preprocessors and reset them as well. - for preprocessing in self.states_preprocessing.values(): - fetch = preprocessing.reset() - if fetch is not None: - fetches.extend(fetch) - - # Get the updated episode and timestep counts. - fetch_list = self.monitored_session.run(fetches=fetches) - episode, timestep = fetch_list[:2] - - return episode, timestep, self.internals_init - - def get_feed_dict( - self, - states=None, - internals=None, - actions=None, - terminal=None, - reward=None, - deterministic=None, - independent=None - ): - feed_dict = dict() - batched = None - - if states is not None: - if batched is None: - name = next(iter(states)) - state = np.asarray(states[name]) - batched = (state.ndim != len(self.states_spec[name]['unprocessed_shape'])) - if batched: - feed_dict.update({state_input: states[name] for name, state_input in self.states_input.items()}) - else: - feed_dict.update({state_input: (states[name],) for name, state_input in self.states_input.items()}) - - if internals is not None: - if batched is None: - name = next(iter(internals)) - internal = np.asarray(internals[name]) - batched = (internal.ndim != len(self.internals_spec[name]['shape'])) - if batched: - feed_dict.update({internal_input: internals[name] for name, internal_input in self.internals_input.items()}) - else: - feed_dict.update({internal_input: (internals[name],) for name, internal_input in self.internals_input.items()}) - - if actions is not None: - if batched is None: - name = next(iter(actions)) - action = np.asarray(actions[name]) - batched = (action.ndim != len(self.actions_spec[name]['shape'])) - if batched: - feed_dict.update({action_input: actions[name] for name, action_input in self.actions_input.items()}) - else: - feed_dict.update({action_input: (actions[name],) for name, action_input in self.actions_input.items()}) - - if terminal is not None: - if batched is None: - terminal = np.asarray(terminal) - batched = (terminal.ndim == 1) - if batched: - feed_dict[self.terminal_input] = terminal - else: - feed_dict[self.terminal_input] = (terminal,) - - if reward is not None: - if batched is None: - reward = np.asarray(reward) - batched = (reward.ndim == 1) - if batched: - feed_dict[self.reward_input] = reward - 
-            else:
-                feed_dict[self.reward_input] = (reward,)
-
-        if deterministic is not None:
-            feed_dict[self.deterministic_input] = deterministic
-
-        if independent is not None:
-            feed_dict[self.independent_input] = independent
-
-        return feed_dict
-
-    def act(self, states, internals, deterministic=False, independent=False, fetch_tensors=None):
-        """
-        Does a forward pass through the model to retrieve the action outputs given inputs for the
-        state (and the internal state, if applicable, e.g. for RNNs).
-
-        Args:
-            states (dict): Dict of state values (each key represents one state space component).
-            internals (dict): Dict of internal state values (each key represents one internal state component).
-            deterministic (bool): If True, will not apply exploration after actions are calculated.
-            independent (bool): If true, action is not followed by observe (and hence not included
-                in updates).
-
-        Returns:
-            tuple:
-                - Actual action-outputs (batched if state input is a batch).
-                - Actual values of internal states (if applicable) (batched if state input is a batch).
-                - The timestep (int) after calculating the (batch of) action(s).
-        """
-        name = next(iter(states))
-        state = np.asarray(states[name])
-        batched = (state.ndim != len(self.states_spec[name]['unprocessed_shape']))
-        if batched:
-            assert self.batching_capacity is not None and state.shape[0] <= self.batching_capacity
-
-        fetches = [self.actions_output, self.internals_output, self.timestep_output]
-        if self.network is not None and fetch_tensors is not None:
-            for name in fetch_tensors:
-                valid, tensor = self.network.get_named_tensor(name)
-                if valid:
-                    fetches.append(tensor)
-                else:
-                    keys = self.network.get_list_of_named_tensor()
-                    raise TensorForceError('Cannot fetch named tensor "{}". Available: {}.'.format(name, keys))
-
-        # feed_dict = {state_input: states[name] for name, state_input in self.states_input.items()}
-        # feed_dict.update({internal_input: internals[n] for n, internal_input in enumerate(self.internals_input)})
-        # else:
-        # feed_dict = {state_input: (states[name],) for name, state_input in self.states_input.items()}
-        # feed_dict.update({internal_input: (internals[n],) for n, internal_input in enumerate(self.internals_input)})
-
-        # feed_dict[self.deterministic_input] = deterministic
-        feed_dict = self.get_feed_dict(
-            states=states,
-            internals=internals,
-            deterministic=deterministic,
-            independent=independent
-        )
-
-        fetch_list = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)
-        actions, internals, timestep = fetch_list[0:3]
-
-        # Extract the first (and only) action/internal from the batch to make return values non-batched
-        if not batched:
-            actions = {name: action[0] for name, action in actions.items()}
-            internals = {name: internal[0] for name, internal in internals.items()}
-
-        if self.summary_configuration_op is not None:
-            summary_values = self.session.run(self.summary_configuration_op)
-            self.summarizer.add_summary(summary_values)
-            self.summarizer.flush()
-            # Only do this operation once to reduce duplicate data in Tensorboard
-            self.summary_configuration_op = None
-
-        if self.network is not None and fetch_tensors is not None:
-            fetch_dict = dict()
-            for index, tensor in enumerate(fetch_list[3:]):
-                name = fetch_tensors[index]
-                fetch_dict[name] = tensor
-            return actions, internals, timestep, fetch_dict
-        else:
-            return actions, internals, timestep
-
-    def observe(self, terminal, reward):
-        """
-        Adds an observation (reward and is-terminal) to the model without updating its trainable variables.
-
-        Args:
-            terminal (bool): Whether the episode has terminated.
-            reward (float): The observed reward value.
-
-        Returns:
-            The value of the model-internal episode counter.
-        """
-        # terminal = np.asarray(terminal)
-        # batched = (terminal.ndim == 1)
-
-        fetches = self.episode_output
-
-        feed_dict = self.get_feed_dict(terminal=terminal, reward=reward)
-
-        # if batched:
-        #     assert self.batching_capacity is not None and terminal.shape[0] <= self.batching_capacity
-        #     feed_dict = {self.terminal_input: terminal, self.reward_input: reward, }
-        # else:
-        #     feed_dict = {self.terminal_input: (terminal,), self.reward_input: (reward,)}
-
-        self.is_observe = True
-        episode = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)
-        self.is_observe = False
-
-        return episode
-
-    def save(self, directory=None, append_timestep=True):
-        """
-        Save the TensorFlow model. If no checkpoint directory is given, the model's default saver
-        directory is used. Optionally appends the current timestep to prevent overwriting previous
-        checkpoint files. Turn this off to be able to load the model from the same path that was
-        passed here.
-
-        Args:
-            directory: Optional checkpoint directory.
-            append_timestep: Appends the current timestep to the checkpoint file if true.
-
-        Returns:
-            Checkpoint path where the model was saved.
-        """
-        if self.summarizer_hook is not None:
-            self.summarizer_hook._summary_writer.flush()
-
-        return self.saver.save(
-            sess=self.session,
-            save_path=(self.saver_directory if directory is None else directory),
-            global_step=(self.global_timestep if append_timestep else None),
-            # latest_filename=None,  # Defaults to 'checkpoint'.
-            meta_graph_suffix='meta',
-            write_meta_graph=True,
-            write_state=True
-        )
-
-    def restore(self, directory=None, file=None):
-        """
-        Restore the TensorFlow model. If no checkpoint file is given, the latest checkpoint is
-        restored. If no checkpoint directory is given, the model's default saver directory is
-        used (unless the file specifies the entire path).
-
-        Args:
-            directory: Optional checkpoint directory.
-            file: Optional checkpoint file, or path if directory not given.
-        """
-        if file is None:
-            file = tf.train.latest_checkpoint(
-                checkpoint_dir=(self.saver_directory if directory is None else directory),
-                # latest_filename=None  # Corresponds to argument of saver.save() in Model.save().
-            )
-        elif directory is None:
-            file = os.path.join(self.saver_directory, file)
-        elif not os.path.isfile(file):
-            file = os.path.join(directory, file)
-
-        # if not os.path.isfile(file):
-        #     raise TensorForceError("Invalid model directory/file.")
-
-        self.saver.restore(sess=self.session, save_path=file)
diff --git a/tensorforce/models/pg_log_prob_model.py b/tensorforce/models/pg_log_prob_model.py
deleted file mode 100755
index 7b58ac478..000000000
--- a/tensorforce/models/pg_log_prob_model.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2017 reinforce.io. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce import util -from tensorforce.models import PGModel - - -class PGLogProbModel(PGModel): - """ - Policy gradient model based on computing log likelihoods, e.g. VPG. - """ - - def tf_loss_per_instance(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - embedding = self.network.apply(x=states, internals=internals, update=update) - log_probs = list() - - for name, distribution in self.distributions.items(): - distr_params = distribution.parameterize(x=embedding) - log_prob = distribution.log_probability(distr_params=distr_params, action=actions[name]) - collapsed_size = util.prod(util.shape(log_prob)[1:]) - log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size)) - log_probs.append(log_prob) - log_prob = tf.reduce_mean(input_tensor=tf.concat(values=log_probs, axis=1), axis=1) - return -log_prob * reward diff --git a/tensorforce/models/pg_model.py b/tensorforce/models/pg_model.py deleted file mode 100755 index 0e9f14592..000000000 --- a/tensorforce/models/pg_model.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce.core.baselines import Baseline, AggregatedBaseline -from tensorforce.core.optimizers import Optimizer -from tensorforce.models import DistributionModel - - -class PGModel(DistributionModel): - """ - Base class for policy gradient models. It optionally defines a baseline - and handles its optimization. It implements the `tf_loss_per_instance` function, but requires - subclasses to implement `tf_pg_loss_per_instance`. 
- """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing, - update_mode, - memory, - optimizer, - discount, - network, - distributions, - entropy_regularization, - baseline_mode, - baseline, - baseline_optimizer, - gae_lambda - ): - # Baseline mode - assert baseline_mode is None or baseline_mode in ('states', 'network') - self.baseline_mode = baseline_mode - - self.baseline_spec = baseline - self.baseline_optimizer_spec = baseline_optimizer - - # Generalized advantage function - assert gae_lambda is None or (0.0 <= gae_lambda <= 1.0 and self.baseline_mode is not None) - self.gae_lambda = gae_lambda - - self.baseline = None - self.baseline_optimizer = None - self.fn_reward_estimation = None - - super(PGModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization, - requires_deterministic=False - ) - - def as_local_model(self): - super(PGModel, self).as_local_model() - if self.baseline_optimizer_spec is not None: - self.baseline_optimizer_spec = dict( - type='global_optimizer', - optimizer=self.baseline_optimizer_spec - ) - - def initialize(self, custom_getter): - super(PGModel, self).initialize(custom_getter) - - # Baseline - if self.baseline_spec is None: - assert self.baseline_mode is None - - elif all(name in self.states_spec for name in self.baseline_spec): - # Implies AggregatedBaseline. - assert self.baseline_mode == 'states' - self.baseline = AggregatedBaseline(baselines=self.baseline_spec) - - else: - assert self.baseline_mode is not None - self.baseline = Baseline.from_spec( - spec=self.baseline_spec, - kwargs=dict( - summary_labels=self.summary_labels - ) - ) - - # Baseline optimizer - if self.baseline_optimizer_spec is not None: - assert self.baseline_mode is not None - self.baseline_optimizer = Optimizer.from_spec(spec=self.baseline_optimizer_spec) - - # TODO: Baseline internal states !!! 
(see target_network q_model) - - # Reward estimation - self.fn_reward_estimation = tf.make_template( - name_='reward-estimation', - func_=self.tf_reward_estimation, - custom_getter_=custom_getter - ) - # Baseline loss - self.fn_baseline_loss = tf.make_template( - name_='baseline-loss', - func_=self.tf_baseline_loss, - custom_getter_=custom_getter - ) - - def tf_reward_estimation(self, states, internals, terminal, reward, update): - if self.baseline_mode is None: - return self.fn_discounted_cumulative_reward(terminal=terminal, reward=reward, discount=self.discount) - - else: - if self.baseline_mode == 'states': - state_value = self.baseline.predict( - states=states, - internals=internals, - update=update - ) - - elif self.baseline_mode == 'network': - embedding = self.network.apply( - x=states, - internals=internals, - update=update - ) - state_value = self.baseline.predict( - states=tf.stop_gradient(input=embedding), - internals=internals, - update=update - ) - - if self.gae_lambda is None: - reward = self.fn_discounted_cumulative_reward( - terminal=terminal, - reward=reward, - discount=self.discount - ) - advantage = reward - state_value - - else: - next_state_value = tf.concat(values=(state_value[1:], (0.0,)), axis=0) - zeros = tf.zeros_like(tensor=next_state_value) - next_state_value = tf.where(condition=terminal, x=zeros, y=next_state_value) - td_residual = reward + self.discount * next_state_value - state_value - gae_discount = self.discount * self.gae_lambda - advantage = self.fn_discounted_cumulative_reward( - terminal=terminal, - reward=td_residual, - discount=gae_discount - ) - - # Normalize advantage. - # mean, variance = tf.nn.moments(advantage, axes=[0], keep_dims=True) - # advantage = (advantage - mean) / tf.sqrt(x=tf.maximum(x=variance, y=util.epsilon)) - - return advantage - - def tf_regularization_losses(self, states, internals, update): - losses = super(PGModel, self).tf_regularization_losses( - states=states, - internals=internals, - update=update - ) - - if self.baseline_mode is not None and self.baseline_optimizer is None: - baseline_regularization_loss = self.baseline.regularization_loss() - if baseline_regularization_loss is not None: - losses['baseline'] = baseline_regularization_loss - - return losses - - def tf_baseline_loss(self, states, internals, reward, update, reference=None): - """ - Creates the TensorFlow operations for calculating the baseline loss of a batch. - - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - reward: Reward tensor. - update: Boolean tensor indicating whether this call happens during an update. - reference: Optional reference tensor(s), in case of a comparative loss. - - Returns: - Loss tensor. - """ - if self.baseline_mode == 'states': - loss = self.baseline.loss( - states=states, - internals=internals, - reward=reward, - update=update, - reference=reference - ) - - elif self.baseline_mode == 'network': - loss = self.baseline.loss( - states=self.network.apply(x=states, internals=internals, update=update), - internals=internals, - reward=reward, - update=update, - reference=reference - ) - - regularization_loss = self.baseline.regularization_loss() - if regularization_loss is not None: - loss += regularization_loss - - return loss - - def baseline_optimizer_arguments(self, states, internals, reward): - """ - Returns the baseline optimizer arguments including the time, the list of variables to - optimize, and various functions which the optimizer might require to perform an update - step. 
- - Args: - states: Dict of state tensors. - internals: List of prior internal state tensors. - reward: Reward tensor. - - Returns: - Baseline optimizer arguments as dict. - """ - arguments = dict( - time=self.global_timestep, - variables=self.baseline.get_variables(), - arguments=dict( - states=states, - internals=internals, - reward=reward, - update=tf.constant(value=True), - ), - fn_reference=self.baseline.reference, - fn_loss=self.fn_baseline_loss, - # source_variables=self.network.get_variables() - ) - if self.global_model is not None: - arguments['global_variables'] = self.global_model.baseline.get_variables() - return arguments - - def tf_optimization(self, states, internals, actions, terminal, reward, next_states=None, next_internals=None): - assert next_states is None and next_internals is None # temporary - - estimated_reward = self.fn_reward_estimation( - states=states, - internals=internals, - terminal=terminal, - reward=reward, - update=tf.constant(value=True) - ) - if self.baseline_optimizer is not None: - estimated_reward = tf.stop_gradient(input=estimated_reward) - - optimization = super(PGModel, self).tf_optimization( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=estimated_reward, - next_states=next_states, - next_internals=next_internals - ) - - if self.baseline_optimizer is not None: - cumulative_reward = self.fn_discounted_cumulative_reward(terminal=terminal, reward=reward, discount=self.discount) - - arguments = self.baseline_optimizer_arguments( - states=states, - internals=internals, - reward=cumulative_reward, - ) - baseline_optimization = self.baseline_optimizer.minimize(**arguments) - - optimization = tf.group(optimization, baseline_optimization) - - return optimization - - def get_variables(self, include_submodules=False, include_nontrainable=False): - model_variables = super(PGModel, self).get_variables( - include_submodules=include_submodules, - include_nontrainable=include_nontrainable - ) - - if self.baseline_mode is not None and (include_submodules or self.baseline_optimizer is None): - baseline_variables = self.baseline.get_variables(include_nontrainable=include_nontrainable) - model_variables += baseline_variables - - if include_nontrainable and self.baseline_optimizer is not None: - baseline_optimizer_variables = self.baseline_optimizer.get_variables() - # For some reason, some optimizer variables are only registered in the model. - for variable in baseline_optimizer_variables: - if variable in model_variables: - model_variables.remove(variable) - model_variables += baseline_optimizer_variables - - return model_variables - - def get_summaries(self): - if self.baseline_mode is None: - return super(PGModel, self).get_summaries() - else: - return super(PGModel, self).get_summaries() + self.baseline.get_summaries() diff --git a/tensorforce/models/pg_prob_ratio_model.py b/tensorforce/models/pg_prob_ratio_model.py deleted file mode 100755 index 0be2c778b..000000000 --- a/tensorforce/models/pg_prob_ratio_model.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -from tensorforce import util -from tensorforce.models import PGModel - - -class PGProbRatioModel(PGModel): - """ - Policy gradient model based on computing likelihood ratios, e.g. TRPO and PPO. - """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing, - update_mode, - memory, - optimizer, - discount, - network, - distributions, - entropy_regularization, - baseline_mode, - baseline, - baseline_optimizer, - gae_lambda, - likelihood_ratio_clipping - ): - # Likelihood ratio clipping - assert likelihood_ratio_clipping is None or likelihood_ratio_clipping > 0.0 - self.likelihood_ratio_clipping = likelihood_ratio_clipping - - # self.reference = None - # self.compare = None - - super(PGProbRatioModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization, - baseline_mode=baseline_mode, - baseline=baseline, - baseline_optimizer=baseline_optimizer, - gae_lambda=gae_lambda - ) - - def tf_reference(self, states, internals, actions, terminal, reward, next_states, next_internals, update): - embedding = self.network.apply(x=states, internals=internals, update=update) - - log_probs = list() - for name in sorted(self.distributions): - distribution = self.distributions[name] - distr_params = distribution.parameterize(x=embedding) - log_prob = distribution.log_probability(distr_params=distr_params, action=actions[name]) - collapsed_size = util.prod(util.shape(log_prob)[1:]) - log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size)) - log_probs.append(log_prob) - - return tf.stop_gradient(input=tf.concat(values=log_probs, axis=1)) - - def tf_loss_per_instance(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - embedding = self.network.apply(x=states, internals=internals, update=update) - - log_probs = list() - for name in sorted(self.distributions): - distribution = self.distributions[name] - distr_params = distribution.parameterize(x=embedding) - log_prob = distribution.log_probability(distr_params=distr_params, action=actions[name]) - collapsed_size = util.prod(util.shape(log_prob)[1:]) - log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size)) - log_probs.append(log_prob) - - log_prob = tf.concat(values=log_probs, axis=1) - if reference is None: - old_log_prob = 
tf.stop_gradient(input=log_prob) - else: - old_log_prob = reference - - prob_ratio = tf.exp(x=(log_prob - old_log_prob)) - prob_ratio = tf.reduce_mean(input_tensor=prob_ratio, axis=1) - - if self.likelihood_ratio_clipping is None: - return -prob_ratio * reward - - else: - clipped_prob_ratio = tf.clip_by_value( - t=prob_ratio, - clip_value_min=(1.0 / (1.0 + self.likelihood_ratio_clipping)), - clip_value_max=(1.0 + self.likelihood_ratio_clipping) - ) - return -tf.minimum(x=(prob_ratio * reward), y=(clipped_prob_ratio * reward)) diff --git a/tensorforce/models/q_demo_model.py b/tensorforce/models/q_demo_model.py deleted file mode 100644 index 806c9cbec..000000000 --- a/tensorforce/models/q_demo_model.py +++ /dev/null @@ -1,334 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util, TensorForceError -from tensorforce.core.memories import Replay -from tensorforce.models import QModel - - -class QDemoModel(QModel): - """ - Model for deep Q-learning from demonstration. Principal structure similar to double - deep Q-networks but uses additional loss terms for demo data. 
- """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing, - update_mode, - memory, - optimizer, - discount, - network, - distributions, - entropy_regularization, - target_sync_frequency, - target_update_weight, - double_q_model, - huber_loss, - expert_margin, - supervised_weight, - demo_memory_capacity, - demo_batch_size - ): - if any(action['type'] not in ('bool', 'int') for action in actions.values()): - raise TensorForceError("Invalid action type, only 'bool' and 'int' are valid!") - - self.expert_margin = expert_margin - self.supervised_weight = supervised_weight - self.demo_memory_capacity = demo_memory_capacity - self.demo_batch_size = demo_batch_size - - super(QDemoModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization, - target_sync_frequency=target_sync_frequency, - target_update_weight=target_update_weight, - double_q_model=double_q_model, - huber_loss=huber_loss - ) - - def initialize(self, custom_getter): - super(QDemoModel, self).initialize(custom_getter=custom_getter) - - self.demo_memory = Replay( - states=self.states_spec, - internals=self.internals_spec, - actions=self.actions_spec, - include_next_states=True, - capacity=self.demo_memory_capacity, - scope='demo-replay', - summary_labels=self.summary_labels - ) - - # Import demonstration optimization. - self.fn_import_demo_experience = tf.make_template( - name_='import-demo-experience', - func_=self.tf_import_demo_experience, - custom_getter_=custom_getter - ) - - # Demonstration loss. - self.fn_demo_loss = tf.make_template( - name_='demo-loss', - func_=self.tf_demo_loss, - custom_getter_=custom_getter - ) - - # Combined loss. - self.fn_combined_loss = tf.make_template( - name_='combined-loss', - func_=self.tf_combined_loss, - custom_getter_=custom_getter - ) - - # Demonstration optimization. - self.fn_demo_optimization = tf.make_template( - name_='demo-optimization', - func_=self.tf_demo_optimization, - custom_getter_=custom_getter - ) - - def tf_initialize(self): - super(QDemoModel, self).tf_initialize() - self.demo_memory.initialize() - - def tf_import_demo_experience(self, states, internals, actions, terminal, reward): - """ - Imports a single experience to memory. - """ - return self.demo_memory.store( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - def tf_demo_loss(self, states, actions, terminal, reward, internals, update, reference=None): - """ - Extends the q-model loss via the dqfd large-margin loss. 
- """ - embedding = self.network.apply(x=states, internals=internals, update=update) - deltas = list() - - for name, action in actions.items(): - distr_params = self.distributions[name].parameterize(x=embedding) - state_action_value = self.distributions[name].state_action_value(distr_params=distr_params, action=action) - - # Create the supervised margin loss - # Zero for the action taken, one for all other actions, now multiply by expert margin - if self.actions_spec[name]['type'] == 'bool': - num_actions = 2 - action = tf.cast(x=action, dtype=util.tf_dtype('int')) - else: - num_actions = self.actions_spec[name]['num_actions'] - - one_hot = tf.one_hot(indices=action, depth=num_actions) - ones = tf.ones_like(tensor=one_hot, dtype=tf.float32) - inverted_one_hot = ones - one_hot - - # max_a([Q(s,a) + l(s,a_E,a)], l(s,a_E, a) is 0 for expert action and margin value for others - state_action_values = self.distributions[name].state_action_value(distr_params=distr_params) - state_action_values = state_action_values + inverted_one_hot * self.expert_margin - supervised_selector = tf.reduce_max(input_tensor=state_action_values, axis=-1) - - # J_E(Q) = max_a([Q(s,a) + l(s,a_E,a)] - Q(s,a_E) - delta = supervised_selector - state_action_value - - action_size = util.prod(self.actions_spec[name]['shape']) - delta = tf.reshape(tensor=delta, shape=(-1, action_size)) - deltas.append(delta) - - loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1) - loss_per_instance = tf.square(x=loss_per_instance) - - return tf.reduce_mean(input_tensor=loss_per_instance, axis=0) - - def tf_combined_loss(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - """ - Combines Q-loss and demo loss. - """ - q_model_loss = self.fn_loss( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals, - update=update, - reference=reference - ) - - demo_loss = self.fn_demo_loss( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - update=update, - reference=reference - ) - - return q_model_loss + self.supervised_weight * demo_loss - - def tf_demo_optimization(self, states, internals, actions, terminal, reward, next_states, next_internals): - arguments = dict( - time=self.global_timestep, - variables=self.get_variables(), - arguments=dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals, - update=tf.constant(value=True) - ), - fn_loss=self.fn_combined_loss - ) - demo_optimization = self.optimizer.minimize(**arguments) - - arguments = self.target_optimizer_arguments() - target_optimization = self.target_optimizer.minimize(**arguments) - - return tf.group(demo_optimization, target_optimization) - - def tf_optimization(self, states, internals, actions, terminal, reward, next_states=None, next_internals=None): - optimization = super(QDemoModel, self).tf_optimization( - states=states, - internals=internals, - actions=actions, - reward=reward, - terminal=terminal, - next_states=next_states, - next_internals=next_internals - ) - - demo_batch = self.demo_memory.retrieve_timesteps(n=self.demo_batch_size) - demo_optimization = self.fn_demo_optimization(**demo_batch) - - return tf.group(optimization, demo_optimization) - - def create_operations(self, states, internals, actions, terminal, reward, deterministic, 
independent): - # Import demo experience operation. - self.import_demo_experience_output = self.fn_import_demo_experience( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - # !!! - super(QDemoModel, self).create_operations( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - deterministic=deterministic, - independent=independent - ) - - # Demo optimization operation. - demo_batch = self.demo_memory.retrieve_timesteps(n=self.demo_batch_size) - self.demo_optimization_output = self.fn_demo_optimization(**demo_batch) - - def get_variables(self, include_submodules=False, include_nontrainable=False): - """ - Returns the TensorFlow variables used by the model. - - Returns: - List of variables. - """ - model_variables = super(QDemoModel, self).get_variables( - include_submodules=include_submodules, - include_nontrainable=include_nontrainable - ) - - if include_nontrainable: - demo_memory_variables = self.demo_memory.get_variables() - model_variables += demo_memory_variables - - return model_variables - - def get_summaries(self): - model_summaries = super(QDemoModel, self).get_summaries() - demo_memory_summaries = self.demo_memory.get_summaries() - - return model_summaries + demo_memory_summaries - - def import_demo_experience(self, states, internals, actions, terminal, reward): - """ - Stores demonstrations in the demo memory. - """ - fetches = self.import_demo_experience_output - - feed_dict = self.get_feed_dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - - self.monitored_session.run(fetches=fetches, feed_dict=feed_dict) - - def demo_update(self): - """ - Performs a demonstration update by calling the demo optimization operation. - Note that the batch data does not have to be fetched from the demo memory as this is now part of - the TensorFlow operation of the demo update. - """ - fetches = self.demo_optimization_output - - self.monitored_session.run(fetches=fetches) diff --git a/tensorforce/models/q_model.py b/tensorforce/models/q_model.py deleted file mode 100755 index 57f14bcbc..000000000 --- a/tensorforce/models/q_model.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util -from tensorforce.models import DistributionModel -from tensorforce.core.networks import Network -from tensorforce.core.optimizers import Optimizer - - -class QModel(DistributionModel): - """ - Q-value model. 
- """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing, - update_mode, - memory, - optimizer, - discount, - network, - distributions, - entropy_regularization, - target_sync_frequency, - target_update_weight, - double_q_model, - huber_loss - ): - self.target_network_spec = network - self.target_optimizer_spec = dict( - type='synchronization', - sync_frequency=target_sync_frequency, - update_weight=target_update_weight - ) - self.double_q_model = double_q_model - - # Huber loss - assert huber_loss is None or huber_loss > 0.0 - self.huber_loss = huber_loss - - self.target_network = None - self.target_optimizer = None - self.target_distributions = None - - super(QModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization, - requires_deterministic=True - ) - - def as_local_model(self): - super(QModel, self).as_local_model() - self.target_optimizer_spec = dict( - type='global_optimizer', - optimizer=self.target_optimizer_spec - ) - - def initialize(self, custom_getter): - super(QModel, self).initialize(custom_getter) - - # # TEMP: Random sampling fix - # if self.random_sampling_fix: - # self.next_states_input = dict() - # for name, state in self.states_spec.items(): - # self.next_states_input[name] = tf.placeholder( - # dtype=util.tf_dtype(state['type']), - # shape=(None,) + tuple(state['shape']), - # name=('next-' + name) - # ) - - # Target network - self.target_network = Network.from_spec( - spec=self.target_network_spec, - kwargs=dict(scope='target', summary_labels=self.summary_labels) - ) - - # Target network optimizer - self.target_optimizer = Optimizer.from_spec(spec=self.target_optimizer_spec) - - # Target network distributions - self.target_distributions = self.create_distributions() - - def tf_q_value(self, embedding, distr_params, action, name): - # Mainly for NAF. - return self.distributions[name].state_action_value(distr_params=distr_params, action=action) - - def tf_q_delta(self, q_value, next_q_value, terminal, reward): - """ - Creates the deltas (or advantage) of the Q values. 
- - :return: A list of deltas per action - """ - for _ in range(util.rank(q_value) - 1): - terminal = tf.expand_dims(input=terminal, axis=1) - reward = tf.expand_dims(input=reward, axis=1) - - multiples = (1,) + util.shape(q_value)[1:] - terminal = tf.tile(input=terminal, multiples=multiples) - reward = tf.tile(input=reward, multiples=multiples) - - zeros = tf.zeros_like(tensor=next_q_value) - next_q_value = tf.where(condition=terminal, x=zeros, y=(self.discount * next_q_value)) - - return reward + next_q_value - q_value # tf.stop_gradient(q_target) - - def tf_loss_per_instance(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - embedding = self.network.apply(x=states, internals=internals, update=update) - - # fix - if self.double_q_model: - next_embedding = self.network.apply( - x=next_states, - internals=next_internals, - update=update - ) - - # Both networks can use the same internals, could that be a problem? - # Otherwise need to handle internals indices correctly everywhere - target_embedding = self.target_network.apply( - x=next_states, - internals=next_internals, - update=update - ) - - deltas = list() - for name, distribution in self.distributions.items(): - target_distribution = self.target_distributions[name] - - distr_params = distribution.parameterize(x=embedding) - target_distr_params = target_distribution.parameterize(x=target_embedding) - - q_value = self.tf_q_value(embedding=embedding, distr_params=distr_params, action=actions[name], name=name) - - if self.double_q_model: - # fix - next_distr_params = distribution.parameterize(x=next_embedding) - action_taken = distribution.sample(distr_params=next_distr_params, deterministic=True) - else: - action_taken = target_distribution.sample(distr_params=target_distr_params, deterministic=True) - - next_q_value = target_distribution.state_action_value(distr_params=target_distr_params, action=action_taken) - - delta = self.tf_q_delta(q_value=q_value, next_q_value=next_q_value, terminal=terminal, reward=reward) - - collapsed_size = util.prod(util.shape(delta)[1:]) - delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size)) - - deltas.append(delta) - - # Surrogate loss as the mean squared error between actual observed rewards and expected rewards - loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1) - # Optional Huber loss - if self.huber_loss is not None and self.huber_loss > 0.0: - loss = tf.where( - condition=(tf.abs(x=loss_per_instance) <= self.huber_loss), - x=(0.5 * tf.square(x=loss_per_instance)), - y=(self.huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * self.huber_loss)) - ) - else: - loss = tf.square(x=loss_per_instance) - - return loss - - def target_optimizer_arguments(self): - """ - Returns the target optimizer arguments including the time, the list of variables to - optimize, and various functions which the optimizer might require to perform an update - step. - - Returns: - Target optimizer arguments as dict. 
- """ - variables = self.target_network.get_variables() + [ - variable for name in sorted(self.target_distributions) - for variable in self.target_distributions[name].get_variables() - ] - source_variables = self.network.get_variables() + [ - variable for name in sorted(self.distributions) - for variable in self.distributions[name].get_variables() - ] - arguments = dict( - time=self.global_timestep, - variables=variables, - source_variables=source_variables - ) - if self.global_model is not None: - arguments['global_variables'] = self.global_model.target_network.get_variables() + [ - variable for name in sorted(self.global_model.target_distributions) - for variable in self.global_model.target_distributions[name].get_variables() - ] - return arguments - - def tf_optimization(self, states, internals, actions, terminal, reward, next_states=None, next_internals=None): - optimization = super(QModel, self).tf_optimization( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward, - next_states=next_states, - next_internals=next_internals - ) - - arguments = self.target_optimizer_arguments() - target_optimization = self.target_optimizer.minimize(**arguments) - - return tf.group(optimization, target_optimization) - - def get_variables(self, include_submodules=False, include_nontrainable=False): - model_variables = super(QModel, self).get_variables( - include_submodules=include_submodules, - include_nontrainable=include_nontrainable - ) - - if include_submodules: - target_variables = self.target_network.get_variables(include_nontrainable=include_nontrainable) - model_variables += target_variables - - target_distributions_variables = [ - variable for name in sorted(self.target_distributions) - for variable in self.target_distributions[name].get_variables(include_nontrainable=include_nontrainable) - ] - model_variables += target_distributions_variables - - if include_nontrainable: - target_optimizer_variables = self.target_optimizer.get_variables() - model_variables += target_optimizer_variables - - return model_variables - - def get_summaries(self): - target_network_summaries = self.target_network.get_summaries() - target_distributions_summaries = [ - summary for name in sorted(self.target_distributions) - for summary in self.target_distributions[name].get_summaries() - ] - - return super(QModel, self).get_summaries() + target_network_summaries + target_distributions_summaries - - # # TEMP: Random sampling fix - # def update(self, states, internals, actions, terminal, reward, return_loss_per_instance=False): - # fetches = [self.optimization] - - # # Optionally fetch loss per instance - # if return_loss_per_instance: - # fetches.append(self.loss_per_instance) - - # terminal = np.asarray(terminal) - # batched = (terminal.ndim == 1) - # if batched: - # # TEMP: Random sampling fix - # if self.random_sampling_fix: - # feed_dict = {state_input: states[name][0] for name, state_input in self.states_input.items()} - # feed_dict.update({state_input: states[name][1] for name, state_input in self.next_states_input.items()}) - # else: - # feed_dict = {state_input: states[name] for name, state_input in self.states_input.items()} - # feed_dict.update( - # {internal_input: internals[n] - # for n, internal_input in enumerate(self.internals_input)} - # ) - # feed_dict.update( - # {action_input: actions[name] - # for name, action_input in self.actions_input.items()} - # ) - # feed_dict[self.terminal_input] = terminal - # feed_dict[self.reward_input] = reward - # else: - # # TEMP: 
Random sampling fix - # if self.random_sampling_fix: - # raise TensorForceError("Unbatched version not covered by fix.") - # else: - # feed_dict = {state_input: (states[name],) for name, state_input in self.states_input.items()} - # feed_dict.update( - # {internal_input: (internals[n],) - # for n, internal_input in enumerate(self.internals_input)} - # ) - # feed_dict.update( - # {action_input: (actions[name],) - # for name, action_input in self.actions_input.items()} - # ) - # feed_dict[self.terminal_input] = (terminal,) - # feed_dict[self.reward_input] = (reward,) - - # feed_dict[self.deterministic_input] = True - # feed_dict[self.update_input] = True - - # fetched = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict) - - # if return_loss_per_instance: - # return fetched[1] diff --git a/tensorforce/models/q_naf_model.py b/tensorforce/models/q_naf_model.py deleted file mode 100755 index 129166354..000000000 --- a/tensorforce/models/q_naf_model.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from six.moves import xrange - -import tensorflow as tf - -from tensorforce import util, TensorForceError -from tensorforce.models import QModel -from tensorforce.core.networks import Linear - - -class QNAFModel(QModel): - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity, - variable_noise, - states_preprocessing, - actions_exploration, - reward_preprocessing, - update_mode, - memory, - optimizer, - discount, - network, - distributions, - entropy_regularization, - target_sync_frequency, - target_update_weight, - double_q_model, - huber_loss - ): - if any(action['type'] != 'float' or 'min_value' in action or 'max_value' in action for action in actions.values()): - raise TensorForceError("Only unconstrained float actions valid for NAFModel.") - - super(QNAFModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=variable_noise, - states_preprocessing=states_preprocessing, - actions_exploration=actions_exploration, - reward_preprocessing=reward_preprocessing, - update_mode=update_mode, - memory=memory, - optimizer=optimizer, - discount=discount, - network=network, - distributions=distributions, - entropy_regularization=entropy_regularization, - target_sync_frequency=target_sync_frequency, - target_update_weight=target_update_weight, - double_q_model=double_q_model, - huber_loss=huber_loss - ) - - def initialize(self, custom_getter): - super(QNAFModel, self).initialize(custom_getter) - - self.state_values = dict() - self.l_entries = dict() - for name, action in self.actions_spec.items(): - 
num_action = util.prod(action['shape']) - self.state_values[name] = Linear(size=num_action, scope='state-value') - self.l_entries[name] = Linear(size=(num_action * (num_action - 1) // 2), scope='l-entries') - - def tf_q_value(self, embedding, distr_params, action, name): - num_action = util.prod(self.actions_spec[name]['shape']) - - mean, stddev, _ = distr_params - flat_mean = tf.reshape(tensor=mean, shape=(-1, num_action)) - flat_stddev = tf.reshape(tensor=stddev, shape=(-1, num_action)) - - # Advantage computation - # Network outputs entries of lower triangular matrix L - if self.l_entries[name] is None: - l_matrix = flat_stddev - l_matrix = tf.exp(l_matrix) - else: - l_matrix = tf.map_fn(fn=tf.diag, elems=flat_stddev) - - l_entries = self.l_entries[name].apply(x=embedding) - l_entries = tf.exp(l_entries) - offset = 0 - columns = list() - for zeros, size in enumerate(xrange(num_action - 1, -1, -1), 1): - column = tf.pad(tensor=l_entries[:, offset: offset + size], paddings=((0, 0), (zeros, 0))) - columns.append(column) - offset += size - - l_matrix += tf.stack(values=columns, axis=1) - - # P = LL^T - p_matrix = tf.matmul(a=l_matrix, b=tf.transpose(a=l_matrix, perm=(0, 2, 1))) - # A = -0.5 (a - mean)P(a - mean) - flat_action = tf.reshape(tensor=action, shape=(-1, num_action)) - difference = flat_action - flat_mean - advantage = tf.matmul(a=p_matrix, b=tf.expand_dims(input=difference, axis=2)) - advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1), b=advantage) - advantage = tf.squeeze(input=(-advantage / 2.0), axis=2) - - # Q = A + V - # State-value function - state_value = self.state_values[name].apply(x=embedding) - q_value = state_value + advantage - - return tf.reshape(tensor=q_value, shape=((-1,) + self.actions_spec[name]['shape'])) - - def tf_loss_per_instance(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None): - # Michael: doubling this function because NAF needs V'(s) not Q'(s), see comment below - embedding = self.network.apply(x=states, internals=internals, update=update) - - # Both networks can use the same internals, could that be a problem? 
- # Otherwise need to handle internals indices correctly everywhere - target_embedding = self.target_network.apply( - x=next_states, - internals=next_internals, - update=update - ) - - deltas = list() - for name, distribution in self.distributions.items(): - target_distribution = self.target_distributions[name] - - distr_params = distribution.parameterize(x=embedding) - target_distr_params = target_distribution.parameterize(x=target_embedding) - - q_value = self.tf_q_value(embedding=embedding, distr_params=distr_params, action=actions[name], name=name) - - # Notice, this is V', not Q' because NAF outputs V(s) separately - next_state_value = target_distribution.state_value(distr_params=target_distr_params) - - delta = self.tf_q_delta(q_value=q_value, next_q_value=next_state_value, terminal=terminal, reward=reward) - - collapsed_size = util.prod(util.shape(delta)[1:]) - delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size)) - - deltas.append(delta) - - # Surrogate loss as the mean squared error between actual observed rewards and expected rewards - loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1) - - if self.huber_loss is not None and self.huber_loss > 0.0: - return tf.where( - condition=(tf.abs(x=loss_per_instance) <= self.huber_loss), - x=(0.5 * tf.square(x=loss_per_instance)), - y=(self.huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * self.huber_loss)) - ) - else: - return tf.square(x=loss_per_instance) - - def tf_regularization_losses(self, states, internals, update): - losses = super(QNAFModel, self).tf_regularization_losses( - states=states, - internals=internals, - update=update - ) - - for state_value in self.state_values.values(): - regularization_loss = state_value.regularization_loss() - if regularization_loss is not None: - if 'state-values' in losses: - losses['state-values'] += regularization_loss - else: - losses['state-values'] = regularization_loss - - for l_entries in self.l_entries.values(): - regularization_loss = l_entries.regularization_loss() - if regularization_loss is not None: - if 'l-entries' in losses: - losses['l-entries'] += regularization_loss - else: - losses['l-entries'] = regularization_loss - - return losses - - def get_variables(self, include_submodules=False, include_nontrainable=False): - model_variables = super(QNAFModel, self).get_variables( - include_submodules=include_submodules, - include_nontrainable=include_nontrainable - ) - - state_values_variables = [ - variable for name in sorted(self.state_values) - for variable in self.state_values[name].get_variables() - ] - model_variables += state_values_variables - - l_entries_variables = [ - variable for name in sorted(self.l_entries) - for variable in self.l_entries[name].get_variables() - ] - model_variables += l_entries_variables - - return model_variables diff --git a/tensorforce/models/q_nstep_model.py b/tensorforce/models/q_nstep_model.py deleted file mode 100755 index 44c5d9fe9..000000000 --- a/tensorforce/models/q_nstep_model.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util -from tensorforce.models import QModel - - -class QNstepModel(QModel): - """ - Deep Q network using n-step rewards as described in Asynchronous Methods for Reinforcement Learning. - """ - - def tf_q_delta(self, q_value, next_q_value, terminal, reward): - for _ in range(util.rank(q_value) - 1): - terminal = tf.expand_dims(input=terminal, axis=1) - reward = tf.expand_dims(input=reward, axis=1) - - multiples = (1,) + util.shape(q_value)[1:] - terminal = tf.tile(input=terminal, multiples=multiples) - reward = tf.tile(input=reward, multiples=multiples) - - reward = self.fn_discounted_cumulative_reward( - terminal=terminal, - reward=reward, - discount=self.discount, - final_reward=next_q_value[-1] - ) - - return reward - q_value diff --git a/tensorforce/models/random_model.py b/tensorforce/models/random_model.py deleted file mode 100644 index 3b192a6f4..000000000 --- a/tensorforce/models/random_model.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorforce import util -from tensorforce.models import Model - - -class RandomModel(Model): - """ - Utility class to return random actions of a desired shape and with given bounds. 
- """ - - def __init__( - self, - states, - actions, - scope, - device, - saver, - summarizer, - distributed, - batching_capacity - ): - super(RandomModel, self).__init__( - states=states, - actions=actions, - scope=scope, - device=device, - saver=saver, - summarizer=summarizer, - distributed=distributed, - batching_capacity=batching_capacity, - variable_noise=None, - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None - ) - - def tf_actions_and_internals(self, states, internals, deterministic): - assert len(internals) == 0 - - actions = dict() - for name, action in self.actions_spec.items(): - shape = (tf.shape(input=next(iter(states.values())))[0],) + action['shape'] - - if action['type'] == 'bool': - actions[name] = (tf.random_uniform(shape=shape) < 0.5) - - elif action['type'] == 'int': - actions[name] = tf.random_uniform(shape=shape, maxval=action['num_actions'], dtype=util.tf_dtype('int')) - - elif action['type'] == 'float': - if 'min_value' in action: - actions[name] = tf.random_uniform( - shape=shape, - minval=action['min_value'], - maxval=action['max_value'] - ) - - else: - actions[name] = tf.random_normal(shape=shape) - - return actions, dict() - - def tf_observe_timestep(self, states, internals, actions, terminal, reward): - return tf.no_op() diff --git a/tensorforce/tests/base_agent_test.py b/tensorforce/tests/base_agent_test.py deleted file mode 100755 index d32b7fcf7..000000000 --- a/tensorforce/tests/base_agent_test.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from tensorforce.tests.base_test import BaseTest -from tensorforce.core.networks import Dense, LayerBasedNetwork -from tensorforce.environments import MinimalTest - - -class BaseAgentTest(BaseTest): - """ - Base class for tests of fundamental Agent functionality, i.e. various action types - and shapes and internal states. - """ - - config = None - multi_config = None - - # Exclude flags to indicate whether a certain test is excluded for a model. - exclude_bool = False - exclude_int = False - exclude_float = False - exclude_bounded = False - exclude_multi = False - exclude_lstm = False - - def test_bool(self): - """ - Tests the case of one boolean action. - """ - if self.__class__.exclude_bool: - return - - environment = MinimalTest(specification={'bool': ()}) - - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - - self.base_test_pass( - name='bool', - environment=environment, - network=network, - **self.__class__.config - ) - - def test_int(self): - """ - Tests the case of one integer action. 
- """ - if self.__class__.exclude_int: - return - - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - - self.base_test_pass( - name='int', - environment=environment, - network=network, - **self.__class__.config - ) - - def test_float(self): - """ - Tests the case of one float action. - """ - if self.__class__.exclude_float: - return - - environment = MinimalTest(specification={'float': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - - self.base_test_pass( - name='float', - environment=environment, - network=network, - **self.__class__.config - ) - - def test_bounded_float(self): - """ - Tests the case of one bounded float action, i.e. with min and max value. - """ - if self.__class__.exclude_bounded: - return - - environment = MinimalTest(specification={'bounded': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - - self.base_test_pass( - name='bounded', - environment=environment, - network=network, - **self.__class__.config - ) - - def test_multi(self): - """ - Tests the case of multiple actions of different type and shape. - """ - if self.__class__.exclude_multi: - return - - exclude_bool = self.__class__.exclude_bool - exclude_int = self.__class__.exclude_int - exclude_float = self.__class__.exclude_float - exclude_bounded = self.__class__.exclude_bounded - - class CustomNetwork(LayerBasedNetwork): - - def __init__(self, scope='layerbased-network', summary_labels=()): - super(CustomNetwork, self).__init__(scope=scope, summary_labels=summary_labels) - - if not exclude_bool: - self.layer_bool1 = Dense(size=16, scope='state-bool1') - self.add_layer(layer=self.layer_bool1) - self.layer_bool2 = Dense(size=16, scope='state-bool2') - self.add_layer(layer=self.layer_bool2) - - if not exclude_int: - self.layer_int1 = Dense(size=16, scope='state-int1') - self.add_layer(layer=self.layer_int1) - self.layer_int2 = Dense(size=16, scope='state-int2') - self.add_layer(layer=self.layer_int2) - - if not exclude_float: - self.layer_float1 = Dense(size=16, scope='state-float1') - self.add_layer(layer=self.layer_float1) - self.layer_float2 = Dense(size=16, scope='state-float2') - self.add_layer(layer=self.layer_float2) - - if not exclude_bounded: - self.layer_bounded1 = Dense(size=16, scope='state-bounded1') - self.add_layer(layer=self.layer_bounded1) - self.layer_bounded2 = Dense(size=16, scope='state-bounded2') - self.add_layer(layer=self.layer_bounded2) - - def tf_apply(self, x, internals, update, return_internals=False): - xs = list() - - if not exclude_bool: - xs.append(self.layer_bool2.apply( - x=self.layer_bool1.apply(x=x['bool'], update=update), update=update - )) - - if not exclude_int: - xs.append(self.layer_int2.apply( - x=self.layer_int1.apply(x=x['int'], update=update), update=update - )) - - if not exclude_float: - xs.append(self.layer_float2.apply( - x=self.layer_float1.apply(x=x['float'], update=update), update=update - )) - - if not exclude_bounded: - xs.append(self.layer_bounded2.apply( - x=self.layer_bounded1.apply(x=x['bounded'], update=update), update=update - )) - - x = xs[0] - for y in xs[1:]: - x *= y - - if return_internals: - return x, dict() - else: - return x - - specification = dict() - if not exclude_bool: - specification['bool'] = () - if not exclude_int: - specification['int'] = (2,) - if not exclude_float: - specification['float'] = (1, 1) - if not exclude_bounded: - specification['bounded'] = (1,) - - environment = 
MinimalTest(specification=specification) - - self.base_test_run( - name='multi', - environment=environment, - network=CustomNetwork, - **self.__class__.config - ) - - def test_lstm(self): - """ - Tests the case of using internal states via an LSTM layer (for one integer action). - """ - if self.__class__.exclude_lstm: - return - - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32), - dict(type='internal_lstm', size=32) - ] - - self.base_test_pass( - name='lstm', - environment=environment, - network=network, - **self.__class__.config - ) diff --git a/tensorforce/tests/base_test.py b/tensorforce/tests/base_test.py deleted file mode 100755 index 4b63f43fa..000000000 --- a/tensorforce/tests/base_test.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import logging -from six.moves import xrange -import sys - -from tensorforce.execution import SingleRunner - - -logging.getLogger('tensorflow').disabled = True - - -class BaseTest(object): - """ - Base class for tests of Agent functionality. - """ - - agent = None - requires_network = True - pass_threshold = 0.8 - - def pre_run(self, agent, environment): - """ - Called before `Runner.run`. - """ - pass - - def base_test_pass(self, name, environment, network, **kwargs): - """ - Basic test loop, requires an Agent to achieve a certain performance on an environment. - - Args: - name (str): The name of the test. - environment (Environment): The Environment object to use for the test. - network (LayerBasedNetwork): The Network to use for the agent's model. - kwargs (any): Agent arguments. 
- """ - sys.stdout.write('\n{} ({}):'.format(self.__class__.agent.__name__, name)) - sys.stdout.flush() - - passed = 0 - for _ in xrange(3): - if self.__class__.requires_network: - agent = self.__class__.agent( - states=environment.states, - actions=environment.actions, - network=network, - **kwargs - ) - else: - agent = self.__class__.agent( - states=environment.states, - actions=environment.actions, - **kwargs - ) - - runner = SingleRunner(agent=agent, environment=environment) - self.pre_run(agent=agent, environment=environment) - - def episode_finished(r): - episodes_passed = [ - rw / ln >= self.__class__.pass_threshold - for rw, ln in zip(r.episode_rewards[-100:], r.episode_timesteps[-100:]) - ] - return r.episode < 100 or not all(episodes_passed) - - runner.run(episodes=2000, episode_finished=episode_finished) - runner.close() - - sys.stdout.write(' ' + str(runner.episode)) - sys.stdout.flush() - if all(rw / ln >= self.__class__.pass_threshold - for rw, ln in zip(runner.episode_rewards[-100:], runner.episode_timesteps[-100:])): - passed += 1 - if passed == 2: - break - - sys.stdout.write(' ==> {} passed\n'.format(passed)) - sys.stdout.flush() - self.assertTrue(passed >= 2) - - def base_test_run(self, name, environment, network, **kwargs): - """ - Run test, tests whether algorithm can run and update without compilation errors, - not whether it passes. - - Args: - name (str): The name of the test. - environment (Environment): The Environment object to use for the test. - network (LayerBasedNetwork): The Network to use for the agent's model. - kwargs (any): Agent arguments. - """ - - sys.stdout.write('\n{} ({}):'.format(self.__class__.agent.__name__, name)) - sys.stdout.flush() - - if self.__class__.requires_network: - agent = self.__class__.agent( - states=environment.states, - actions=environment.actions, - network=network, - **kwargs - ) - else: - agent = self.__class__.agent( - states=environment.states, - actions=environment.actions, - **kwargs - ) - - runner = SingleRunner(agent=agent, environment=environment) - self.pre_run(agent=agent, environment=environment) - - runner.run(num_episodes=100) - runner.close() - - sys.stdout.write(' ran\n') - sys.stdout.flush() diff --git a/tensorforce/tests/minimal_test.py b/tensorforce/tests/minimal_test.py deleted file mode 100755 index aa50d98bb..000000000 --- a/tensorforce/tests/minimal_test.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from random import random - -import numpy as np - -from tensorforce import util, TensorForceError -from tensorforce.environments import Environment - - -class MinimalTest(Environment): - - def __init__(self, specification): - """ - Initializes a minimal test environment, which is used for the unit tests. - Given a specification of actions types and shapes, the environment states consist - of the same number of pairs (x, y). 
The (mean of) an action a gives the next state via (1-a, a), - and the 'correct' state is always (0, 1). - - Args: - specification: Takes a dict type (keys)-> shape (values specifying the action - structure of the environment. Use shape () for single scalar actions. - """ - self.specification = dict() - for action_type, shape in specification.items(): - if action_type in ('bool', 'int', 'float', 'bounded'): - if isinstance(shape, int): - self.specification[action_type] = (shape,) - else: - self.specification[action_type] = tuple(shape) - else: - raise TensorForceError('Invalid MinimalTest specification.') - self.single_state_action = (len(specification) == 1) - - def __str__(self): - return 'MinimalTest' - - def close(self): - pass - - def reset(self): - self.state = {action_type: (1.0, 0.0) for action_type in self.specification} - if self.single_state_action: - return next(iter(self.state.values())) - else: - return dict(self.state) - - def execute(self, actions): - if self.single_state_action: - actions = {next(iter(self.specification)): actions} - - reward = 0.0 - for action_type, shape in self.specification.items(): - if action_type == 'bool' or action_type == 'int': - correct = np.sum(actions[action_type]) - overall = util.prod(shape) - self.state[action_type] = ((overall - correct) / overall, correct / overall) - elif action_type == 'float' or action_type == 'bounded': - step = np.sum(actions[action_type]) / util.prod(shape) - self.state[action_type] = max(self.state[action_type][0] - step, 0.0), min(self.state[action_type][1] + step, 1.0) - reward += max(min(self.state[action_type][1], 1.0), 0.0) - - terminal = random() < 0.25 - if self.single_state_action: - return next(iter(self.state.values())), terminal, reward - else: - reward = reward / len(self.specification) - return dict(self.state), terminal, reward - - @property - def states(self): - if self.single_state_action: - return dict(shape=2, type='float') - else: - return {action_type: dict(shape=(2,), type='float') for action_type in self.specification} - - @property - def actions(self): - if self.single_state_action: - action_type = next(iter(self.specification)) - if action_type == 'int': - return dict(type='int', num_actions=2) - elif action_type == 'bounded': - return dict(type='float', min_value=-0.5, max_value=1.5) - else: - return dict(type=action_type) - else: - actions = dict() - for action_type, shape in self.specification.items(): - if action_type == 'int': - actions[action_type] = dict(type='int', shape=shape, num_actions=2) - elif action_type == 'bounded': - actions[action_type] = dict(type='float', shape=shape, min_value=-0.5, max_value=1.5) - else: - actions[action_type] = dict(type=action_type, shape=shape) - return actions diff --git a/tensorforce/tests/test_constant_agent.py b/tensorforce/tests/test_constant_agent.py deleted file mode 100755 index 681449af7..000000000 --- a/tensorforce/tests/test_constant_agent.py +++ /dev/null @@ -1,41 +0,0 @@ - -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce.agents import ConstantAgent - - -class TestConstantAgent(BaseAgentTest, unittest.TestCase): - - agent = ConstantAgent - requires_network = False - - # Just testing float/bounded test (otherwise would have to use different config for each test) - config = dict( - action_values=dict( - action=1.0 - ) - ) - - exclude_bool = True - exclude_int = True - exclude_multi = True - exclude_lstm = True diff --git a/tensorforce/tests/test_dqfd_agent.py b/tensorforce/tests/test_dqfd_agent.py deleted file mode 100644 index 2509b8daa..000000000 --- a/tensorforce/tests/test_dqfd_agent.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from six.moves import xrange -import unittest -import numpy as np - -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce import util -from tensorforce.agents import DQFDAgent - - -class TestDQFDAgent(BaseAgentTest, unittest.TestCase): - - agent = DQFDAgent - config = dict( - update_mode=dict( - unit='timesteps', - batch_size=8, - frequency=4 - ), - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - target_sync_frequency=10, - demo_memory_capacity=100, - demo_sampling_ratio=0.2 - # first_update=10, - ) - - exclude_float = True - exclude_bounded = True - - def pre_run(self, agent, environment): - demonstrations = list() - - agent.reset() - internals = agent.current_internals - next_states = None - terminal = True - - for n in xrange(50): - if terminal: - states = environment.reset() - else: - assert next_states is not None - states = next_states - - actions = dict() - # Create demonstration actions of the right shape. 
- if 'type' in environment.actions: - if environment.actions['type'] == 'bool': - actions = np.full( - shape=(), - fill_value=True, - dtype=util.np_dtype(environment.actions['type']) - ) - elif environment.actions['type'] == 'int': - actions = np.full( - shape=(), - fill_value=1, - dtype=util.np_dtype(environment.actions['type']) - ) - elif environment.actions['type'] == 'float': - actions = np.full( - shape=(), - fill_value=1.0, - dtype=util.np_dtype(environment.actions['type']) - ) - else: - for name, action in environment.actions.items(): - if action['type'] == 'bool': - actions[name] = np.full( - shape=action['shape'], - fill_value=True, - dtype=util.np_dtype(action['type']) - ) - elif action['type'] == 'int': - actions[name] = np.full( - shape=action['shape'], - fill_value=1, - dtype=util.np_dtype(action['type']) - ) - elif action['type'] == 'float': - actions[name] = np.full( - shape=action['shape'], - fill_value=1.0, - dtype=util.np_dtype(action['type']) - ) - - next_states, terminal, reward = environment.execute(actions=actions) - - demonstration = dict( - states=states, - internals=internals, - actions=actions, - terminal=terminal, - reward=reward - ) - demonstrations.append(demonstration) - - agent.import_demonstrations(demonstrations=demonstrations) - agent.pretrain(steps=1000) - - # multi_config = dict( - # memory=dict( - # type='replay', - # capacity=1000 - # ), - # optimizer=dict( - # type="adam", - # learning_rate=0.01 - # ), - # repeat_update=1, - # batch_size=16, - # first_update=16, - # target_sync_frequency=10, - # demo_memory_capacity=100, - # demo_sampling_ratio=0.2 - # ) diff --git a/tensorforce/tests/test_dqn_agent.py b/tensorforce/tests/test_dqn_agent.py deleted file mode 100755 index a0e5fbb5e..000000000 --- a/tensorforce/tests/test_dqn_agent.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce.agents import DQNAgent - - -class TestDQNAgent(BaseAgentTest, unittest.TestCase): - - agent = DQNAgent - config = dict( - update_mode=dict( - unit='timesteps', - batch_size=8, - frequency=4 - ), - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ), - # memory=dict( - # type='prioritized_replay', - # include_next_states=True, - # buffer_size=20, - # capacity=1000 - # ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - states_preprocessing=[ - dict(type='running_standardize'), - dict(type='sequence') - ], - target_sync_frequency=10, - # Comment in to test exploration types - # actions_exploration_spec=dict( - # type="epsilon_decay", - # initial_epsilon=1.0, - # final_epsilon=0.1, - # timesteps=10 - # ), - # actions_exploration_spec=dict( - # type="epsilon_anneal", - # initial_epsilon=1.0, - # final_epsilon=0.1, - # timesteps=10 - # ) - ) - - exclude_float = True - exclude_bounded = True diff --git a/tensorforce/tests/test_dqn_nstep_agent.py b/tensorforce/tests/test_dqn_nstep_agent.py deleted file mode 100644 index 9923fbcd8..000000000 --- a/tensorforce/tests/test_dqn_nstep_agent.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce.agents import DQNNstepAgent - - -class TestDQNNstepAgent(BaseAgentTest, unittest.TestCase): - - agent = DQNNstepAgent - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=True, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - exclude_float = True - exclude_bounded = True - exclude_multi = True diff --git a/tensorforce/tests/test_naf_agent.py b/tensorforce/tests/test_naf_agent.py deleted file mode 100755 index 52751bcb3..000000000 --- a/tensorforce/tests/test_naf_agent.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce.agents import NAFAgent - - -class TestNAFAgent(BaseAgentTest, unittest.TestCase): - - agent = NAFAgent - config = dict( - actions_exploration=dict( - type='ornstein_uhlenbeck' - ), - update_mode=dict( - unit='timesteps', - batch_size=8, - frequency=4 - ), - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - target_sync_frequency=10 - # memory=dict( - # type='replay', - # capacity=1000 - # ), - # optimizer=dict( - # type='adam', - # learning_rate=0.001 - # ), - # batch_size=8, - # first_update=8, - # repeat_update=4 - ) - - exclude_bool = True - exclude_int = True - exclude_bounded = True - exclude_multi = True - exclude_lstm = True diff --git a/tensorforce/tests/test_ppo_agent.py b/tensorforce/tests/test_ppo_agent.py deleted file mode 100644 index a6a50770c..000000000 --- a/tensorforce/tests/test_ppo_agent.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce.agents import PPOAgent - - -class TestPPOAgent(BaseAgentTest, unittest.TestCase): - - agent = PPOAgent - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - step_optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - subsampling_fraction=0.3, - optimization_steps=20 - ) diff --git a/tensorforce/tests/test_quickstart_example.py b/tensorforce/tests/test_quickstart_example.py deleted file mode 100644 index 43d158c65..000000000 --- a/tensorforce/tests/test_quickstart_example.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import logging -import numpy as np -from six.moves import xrange -import sys -import unittest - -from tensorforce.agents import PPOAgent -from tensorforce.execution import Runner -from tensorforce.contrib.openai_gym import OpenAIGym - - -logging.getLogger('tensorflow').disabled = True - - -class TestQuickstartExample(unittest.TestCase): - - def test_example(self): - sys.stdout.write('\nQuickstart:\n') - sys.stdout.flush() - - passed = 0 - for _ in xrange(3): - - # Create an OpenAI-Gym environment - environment = OpenAIGym('CartPole-v0') - - # Network specification for the model - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - - # Create the agent - agent = PPOAgent( - states=environment.states, - actions=environment.actions, - network=network, - # Model - states_preprocessing=None, - actions_exploration=None, - reward_preprocessing=None, - # MemoryModel - update_mode=dict( - unit='episodes', - # 10 episodes per update - batch_size=10, - # Every 10 episodes - frequency=10 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=5000 - ), - discount=0.99, - # DistributionModel - distributions=None, - entropy_regularization=0.01, - # PGModel - baseline_mode='states', - baseline=dict( - type='mlp', - sizes=[32, 32] - ), - baseline_optimizer=dict( - type='multi_step', - optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - num_steps=5 - ), - gae_lambda=None, - # PGLRModel - likelihood_ratio_clipping=0.2, - # PPOAgent - step_optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - subsampling_fraction=0.1, - optimization_steps=50 - ) - - # Initialize the runner - runner = Runner(agent=agent, environment=environment) - - # Function handle called after each finished episode - def episode_finished(r): - # Test if mean reward over 50 
should ensure that learning took off - mean_reward = np.mean(r.episode_rewards[-50:]) - return r.episode < 100 or mean_reward < 50.0 - - # Start the runner - runner.run(episodes=2000, max_episode_timesteps=200, episode_finished=episode_finished) - runner.close() - - sys.stdout.write('episodes: {}\n'.format(runner.episode)) - sys.stdout.flush() - - # Test passed if episode_finished handle evaluated to False - if runner.episode < 2000: - passed += 1 - - sys.stdout.write('==> passed: {}\n'.format(passed)) - sys.stdout.flush() - self.assertTrue(passed >= 2) diff --git a/tensorforce/tests/test_random_agent.py b/tensorforce/tests/test_random_agent.py deleted file mode 100755 index 1aeb342bf..000000000 --- a/tensorforce/tests/test_random_agent.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce.agents import RandomAgent - - -class TestRandomAgent(BaseAgentTest, unittest.TestCase): - - agent = RandomAgent - requires_network = False - # Random agent is not expected to pass anything. - pass_threshold = 0.0 - - config = dict() - - # Not using a network so no point in testing LSTM. - exclude_lstm = True diff --git a/tensorforce/tests/test_reward_estimation.py b/tensorforce/tests/test_reward_estimation.py deleted file mode 100755 index c8390c8d0..000000000 --- a/tensorforce/tests/test_reward_estimation.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import logging -import numpy as np -import unittest - -from tensorforce.agents import VPGAgent -from tensorforce.core.baselines import Baseline - - -logging.getLogger('tensorflow').disabled = True - - -class TestRewardEstimation(unittest.TestCase): - #TODO - reward is now computed using TensorFlow operations and not easily - # accessible. Need to rethink how to test things like this. 
- - def test_basic(self): - - kwargs = dict( - discount=0.75, - batch_size=8, - learning_rate=0.001, - ) - # agent = VPGAgent( - # states_spec=dict(shape=(1,)), - # actions_spec=dict(type='int', num_actions=2), - # network_spec=[dict(type='dense', size=32)], - # config=config - # ) - - states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] - actions = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0] - terminals = [False, False, False, False, True, False, False, False, True] - discounted_rewards = np.array([ - 0.75 + 0.75 ** 4, 1.0 + 0.75 ** 3, 0.75 ** 2, 0.75, 1.0, - 1.0 + 0.75 ** 2, 0.75, 1.0, 0.0 - ]) - - feed_dict=dict() - # feed_dict[agent.model.reward_input] = rewards - # fetches = [agent.model.get_reward()] - # result = agent.model.session.run(feed_dict=feed_dict, fetches=fetches) - - - expected = discounted_rewards - - #self.assertTrue((result == expected).all()) - - def test_baseline(self): - kwargs = dict( - discount=0.75, - batch_size=8, - learning_rate=0.001, - ) - # agent = VPGAgent( - # states_spec=dict(shape=(1,)), - # actions_spec=dict(type='int', num_actions=2), - # network_spec=[dict(type='dense', size=32)], - # config=config - # ) - - states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] - rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0] - terminals = [False, False, False, False, True, False, False, False, True] - discounted_rewards = np.array([ - 0.75 + 0.75 ** 4, 1.0 + 0.75 ** 3, 0.75 ** 2, 0.75, 1.0, - 1.0 + 0.75 ** 2, 0.75, 1.0, 0.0 - ]) - baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0]) - #agent.model.baseline = dict(state=Baseline()) - #agent.model.baseline['state'].predict = lambda states: baseline - - #result, _ = agent.model.reward_estimation(states=dict(state=states), rewards=rewards, terminals=terminals) - expected = discounted_rewards - baseline - #print(result) - #print(expected) - #self.assertTrue((result == expected).all()) - - def test_gae(self): - kwargs = dict( - discount=0.75, - batch_size=8, - learning_rate=0.001, - # gae_rewards=True, - # gae_lambda=0.5 - ) - # agent = VPGAgent( - # states_spec=dict(shape=(1,)), - # actions_spec=dict(type='int', num_actions=2), - # network_spec=[dict(type='dense', size=32)], - # config=config - # ) - - states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] - rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0] - terminals = [False, False, False, False, True, False, False, False, True] - baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0]) - #agent.model.baseline = dict(state=Baseline()) - #agent.model.baseline['state'].predict = lambda states: baseline - td_residuals = np.array([ - 0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.75 * 0.25, 0.75 * 0.5 - 0.25, 1.0, - 1.0 + 0.75 * 0.25 - 0.5, 0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.0 - ]) - - # result, _ = agent.model.reward_estimation( - # states=dict(state=states), - # rewards=rewards, - # terminals=terminals - # ) - - expected = np.array([ - np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3, 4])) * td_residuals[:5]), - np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3])) * td_residuals[1:5]), - np.sum(((0.5 * 0.75) ** np.array([0, 1, 2])) * td_residuals[2:5]), - np.sum(((0.5 * 0.75) ** np.array([0, 1])) * td_residuals[3:5]), - np.sum(((0.5 * 0.75) ** np.array([0])) * td_residuals[4:5]), - np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3])) * td_residuals[5:]), - np.sum(((0.5 * 0.75) ** np.array([0, 1, 2])) * td_residuals[6:]), - np.sum(((0.5 * 0.75) ** np.array([0, 1])) * td_residuals[7:]), - 
np.sum(((0.5 * 0.75) ** np.array([0])) * td_residuals[8:]) - ]) - #self.assertTrue((result == expected).all()) diff --git a/tensorforce/tests/test_trpo_agent.py b/tensorforce/tests/test_trpo_agent.py deleted file mode 100755 index b505bc1a1..000000000 --- a/tensorforce/tests/test_trpo_agent.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce.agents import TRPOAgent - - -class TestTRPOAgent(BaseAgentTest, unittest.TestCase): - - agent = TRPOAgent - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - learning_rate=1e-2 - ) - - # multi_config = dict( - # batch_size=64, - # learning_rate=0.1 - # ) diff --git a/tensorforce/tests/test_tutorial_code.py b/tensorforce/tests/test_tutorial_code.py deleted file mode 100644 index ff2e93425..000000000 --- a/tensorforce/tests/test_tutorial_code.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import logging -import unittest - - -logging.getLogger('tensorflow').disabled = True - - -class TestTutorialCode(unittest.TestCase): - """ - Validation of random code snippets as to be notified when old blog posts need to be changed. - """ - - class MyClient(object): - - def __init__(self, *args, **kwargs): - pass - - def get_state(self): - import numpy as np - return np.random.rand(10) - - def execute(self, action): - pass - - def test_reinforceio_homepage(self): - """ - Code example from the homepage and README.md. 
- """ - - from tensorforce.agents import TRPOAgent - - # Create a Trust Region Policy Optimization agent - agent = TRPOAgent( - states=dict(shape=(10,), type='float'), - actions=dict(type='int', num_actions=2), - network=[dict(type='dense', size=50), dict(type='dense', size=50)], - update_mode=dict( - unit='episodes', - batch_size=1, - frequency=1 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ) - ) - - # Get new data from somewhere, e.g. a client to a web app - client = TestTutorialCode.MyClient('http://127.0.0.1', 8080) - - # Poll new state from client - state = client.get_state() - - # Get prediction from agent, execute - action = agent.act(states=state) - reward = client.execute(action) - - # Add experience, agent automatically updates model according to batch size - agent.observe(reward=reward, terminal=False) - agent.close() - - def test_blogpost_introduction(self): - """ - Test of introduction blog post examples. - """ - import tensorflow as tf - - ### DQN agent example - from tensorforce.agents import DQNAgent - - # Network is an ordered list of layers - network_spec = [dict(type='dense', size=32), dict(type='dense', size=32)] - - # Define a state - states = dict(shape=(10,), type='float') - - # Define an action - actions = dict(type='int', num_actions=5) - - agent = DQNAgent( - states=states, - actions=actions, - network=network_spec, - update_mode=dict( - unit='timesteps', - batch_size=1, - frequency=1 - ), - memory=dict( - type='latest', - include_next_states=True, - capacity=100 - ), - target_sync_frequency=10 - ) - - agent.close() - - ### Code block: multiple states - states = dict( - image=dict(shape=(64, 64, 3), type='float'), - caption=dict(shape=(20,), type='int') - ) - - # DQN does not support multiple states. Omit test for now. 
- # agent = DQNAgent(config=config) - - ### Code block: DQN observer function - - def observe(self, reward, terminal): - super(DQNAgent, self).observe(reward, terminal) - if self.timestep >= self.first_update \ - and self.timestep % self.target_update_frequency == 0: - self.model.update_target() - - ### Code block: Network config JSON - - network_json = """ - [ - { - "type": "conv2d", - "size": 32, - "window": 8, - "stride": 4 - }, - { - "type": "conv2d", - "size": 64, - "window": 4, - "stride": 2 - }, - { - "type": "flatten" - }, - { - "type": "dense", - "size": 512 - } - ] - """ - - ### Test json - - import json - network_spec = json.loads(network_json) - - ### Code block: Modified dense layer - - modified_dense = """ - [ - { - "type": "dense", - "size": 64, - "bias": false, - "activation": "selu", - "l2_regularization": 0.001 - } - ] - """ - - ### Test json - network_spec = json.loads(modified_dense) - - ### Code block: Own layer type - from tensorforce.core.networks import Layer - - class BatchNormalization(Layer): - - def __init__(self, variance_epsilon=1e-6, scope='batchnorm', summary_labels=None): - super(BatchNormalization, self).__init__(scope=scope, summary_labels=summary_labels) - self.variance_epsilon = variance_epsilon - - def tf_apply(self, x, update): - mean, variance = tf.nn.moments(x, axes=tuple(range(x.shape.ndims - 1))) - return tf.nn.batch_normalization( - x=x, - mean=mean, - variance=variance, - offset=None, - scale=None, - variance_epsilon=self.variance_epsilon - ) - - ### Test own layer - - states = dict(shape=(10,), type='float') - network_spec = [ - {'type': 'dense', 'size': 32}, - {'type': BatchNormalization, 'variance_epsilon': 1e-9} - ] - - agent = DQNAgent( - states=states, - actions=actions, - network=network_spec, - update_mode=dict( - unit='timesteps', - batch_size=8, - frequency=4 - ), - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ) - ) - - agent.close() - - ### Code block: Own network builder - from tensorforce.core.networks import Network - - class CustomNetwork(Network): - - def tf_apply(self, x, internals, update, return_internals=False): - image = x['image'] # 64x64x3-dim, float - caption = x['caption'] # 20-dim, int - initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01, dtype=tf.float32) - - # CNN - weights = tf.get_variable(name='W1', shape=(3, 3, 3, 16), initializer=initializer) - image = tf.nn.conv2d(image, filter=weights, strides=(1, 1, 1, 1), padding='SAME') - image = tf.nn.relu(image) - image = tf.nn.max_pool(image, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1), padding='SAME') - - weights = tf.get_variable(name='W2', shape=(3, 3, 16, 32), initializer=initializer) - image = tf.nn.conv2d(image, filter=weights, strides=(1, 1, 1, 1), padding='SAME') - image = tf.nn.relu(image) - image = tf.nn.max_pool(image, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1), padding='SAME') - - image = tf.reshape(image, shape=(-1, 16 * 16, 32)) - image = tf.reduce_mean(image, axis=1) - - # LSTM - weights = tf.get_variable(name='W3', shape=(30, 32), initializer=initializer) - caption = tf.nn.embedding_lookup(params=weights, ids=caption) - lstm = tf.contrib.rnn.LSTMCell(num_units=32) - caption, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=caption, dtype=tf.float32) - caption = tf.reduce_mean(caption, axis=1) - - # Combination - if return_internals: - return tf.multiply(image, caption), list() - else: - return tf.multiply(image, caption) - - ### Test own network builder - - states = dict( - image=dict(shape=(64, 64, 3), type='float'), - 
caption=dict(shape=(20,), type='int') - ) - - agent = DQNAgent( - states=states, - actions=actions, - network=CustomNetwork, - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ) - ) - - agent.close() - - ### Code block: LSTM function - from tensorforce.core.networks import Layer - - class Lstm(Layer): - - def __init__(self, size, scope='lstm', summary_labels=()): - self.size = size - super(Lstm, self).__init__(scope=scope, summary_labels=summary_labels) - - def tf_apply(self, x, update, state): - state = tf.contrib.rnn.LSTMStateTuple(c=state[:, 0, :], h=state[:, 1, :]) - self.lstm_cell = tf.contrib.rnn.LSTMCell(num_units=self.size) - x, state = self.lstm_cell(inputs=x, state=state) - state = tf.stack(values=(state.c, state.h), axis=1) - return x, dict(state=state) - - def internals_spec(self): - return dict(state=dict( - type='float', - shape=(2, self.size), - initialization='zeros' - )) - - ### Test LSTM - states = dict(shape=(10,), type='float') - network_spec = [ - {'type': 'flatten'}, - {'type': Lstm, 'size': 10} - ] - - agent = DQNAgent( - states=states, - actions=actions, - network=network_spec, - update_mode=dict( - unit='timesteps', - batch_size=100, - frequency=4 - ), - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ) - ) - - agent.close() - - ### Preprocessing configuration - states = dict(shape=(84, 84, 3), type='float') - states_preprocessing_spec = [ - dict( - type='image_resize', - width=84, - height=84 - ), - dict( - type='grayscale' - ), - dict( - type='normalize' - ) - # sequence preprocessing is temporarily broken - # , - # dict( - # type='sequence', - # length=4 - # ) - ] - - ### Test preprocessing configuration - - agent = DQNAgent( - states=states, - actions=actions, - network=network_spec, - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ), - target_sync_frequency=50, - states_preprocessing=states_preprocessing_spec - ) - - agent.close() - - ### Code block: Continuous action exploration - - exploration = dict( - type='ornstein_uhlenbeck', - sigma=0.1, - mu=0.0, - theta=0.1 - ) - - ### Test continuous action exploration - agent = DQNAgent( - states=states, - actions=actions, - network=network_spec, - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ), - actions_exploration=exploration - ) - - agent.close() - - ### Code block: Discrete action exploration - - exploration = dict( - type='epsilon_decay', - initial_epsilon=1.0, - final_epsilon=0.01, - timesteps=1e6 - ) - - ### Test discrete action exploration - agent = DQNAgent( - states=states, - actions=actions, - network=network_spec, - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ), - actions_exploration=exploration - ) - - agent.close() - - def test_blogpost_introduction_runner(self): - from tensorforce.tests.minimal_test import MinimalTest - from tensorforce.agents import DQNAgent - from tensorforce.execution import Runner - - environment = MinimalTest(specification={'int': ()}) - - network_spec = [ - dict(type='dense', size=32) - ] - - agent = DQNAgent( - states=environment.states, - actions=environment.actions, - network=network_spec, - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ), - target_sync_frequency=50 - ) - runner = Runner(agent=agent, environment=environment) - - def episode_finished(runner): - if runner.episode % 100 == 0: - print(sum(runner.episode_rewards[-100:]) / 100) - return runner.episode < 100 \ - or not all(reward >= 1.0 for reward 
in runner.episode_rewards[-100:]) - - # runner.run(episodes=1000, episode_finished=episode_finished) - runner.run(episodes=10, episode_finished=episode_finished) # Only 10 episodes for this test - runner.close() - - ### Code block: next - agent = DQNAgent( - states=environment.states, - actions=environment.actions, - network=network_spec, - memory=dict( - type='replay', - include_next_states=True, - capacity=100 - ), - target_sync_frequency=50 - ) - - # max_episodes = 1000 - max_episodes = 10 # Only 10 episodes for this test - max_timesteps = 2000 - - episode = 0 - episode_rewards = list() - - while True: - state = environment.reset() - agent.reset() - - timestep = 0 - episode_reward = 0 - while True: - action = agent.act(states=state) - state, terminal, reward = environment.execute(actions=action) - agent.observe(terminal=terminal, reward=reward) - - timestep += 1 - episode_reward += reward - - if terminal or timestep == max_timesteps: - break - - episode += 1 - episode_rewards.append(episode_reward) - - if all(reward >= 1.0 for reward in episode_rewards[-100:]) or episode == max_episodes: - break - - agent.close() - environment.close() diff --git a/tensorforce/tests/test_vpg_agent.py b/tensorforce/tests/test_vpg_agent.py deleted file mode 100755 index 11cb42b46..000000000 --- a/tensorforce/tests/test_vpg_agent.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_agent_test import BaseAgentTest -from tensorforce.agents import VPGAgent - - -class TestVPGAgent(BaseAgentTest, unittest.TestCase): - - agent = VPGAgent - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - # multi_config = dict( - # batch_size=64, - # optimizer=dict( - # type='adam', - # learning_rate=0.01 - # ) - # ) diff --git a/tensorforce/tests/test_vpg_baselines.py b/tensorforce/tests/test_vpg_baselines.py deleted file mode 100755 index f9148bdd5..000000000 --- a/tensorforce/tests/test_vpg_baselines.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_test import BaseTest -from tensorforce.agents import VPGAgent -from tensorforce.core.networks import Dense, LayerBasedNetwork -from tensorforce.environments import MinimalTest - - -class TestVPGBaselines(BaseTest, unittest.TestCase): - - agent = VPGAgent - - def test_states_baseline(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - baseline_mode='states', - baseline=dict( - type='mlp', - sizes=[32, 32] - ), - baseline_optimizer=dict( - type='multi_step', - optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - num_steps=5 - ) - ) - self.base_test_pass( - name='states-baseline', - environment=environment, - network=network, - **config - ) - - def test_network_baseline(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - baseline_mode='network', - baseline=dict( - type='mlp', - sizes=[32, 32] - ), - baseline_optimizer=dict( - type='multi_step', - optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - num_steps=5 - ) - ) - self.base_test_pass( - name='network-baseline', - environment=environment, - network=network, - **config - ) - - def test_baseline_no_optimizer(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - baseline_mode='states', - baseline=dict( - type='mlp', - sizes=[32, 32] - ) - ) - self.base_test_pass( - name='baseline-no-optimizer', - environment=environment, - network=network, - **config - ) - - def test_gae_baseline(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - baseline_mode='states', - baseline=dict( - type='mlp', - sizes=[32, 32] - ), - baseline_optimizer=dict( - type='multi_step', - optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - num_steps=5 - ), - gae_lambda=0.95 - ) - self.base_test_pass( - name='gae-baseline', - environment=environment, - network=network, - **config - ) - - def test_multi_baseline(self): - - class CustomNetwork(LayerBasedNetwork): - - def __init__(self, scope='layerbased-network', summary_labels=()): - 
super(CustomNetwork, self).__init__(scope=scope, summary_labels=summary_labels) - - self.layer_bool1 = Dense(size=16, scope='state-bool1') - self.add_layer(layer=self.layer_bool1) - self.layer_bool2 = Dense(size=16, scope='state-bool2') - self.add_layer(layer=self.layer_bool2) - - self.layer_int1 = Dense(size=16, scope='state-int1') - self.add_layer(layer=self.layer_int1) - self.layer_int2 = Dense(size=16, scope='state-int2') - self.add_layer(layer=self.layer_int2) - - self.layer_float1 = Dense(size=16, scope='state-float1') - self.add_layer(layer=self.layer_float1) - self.layer_float2 = Dense(size=16, scope='state-float2') - self.add_layer(layer=self.layer_float2) - - self.layer_bounded1 = Dense(size=16, scope='state-bounded1') - self.add_layer(layer=self.layer_bounded1) - self.layer_bounded2 = Dense(size=16, scope='state-bounded2') - self.add_layer(layer=self.layer_bounded2) - - def tf_apply(self, x, internals, update, return_internals=False): - x0 = self.layer_bool2.apply(x=self.layer_bool1.apply(x=x['bool'], update=update), update=update) - x1 = self.layer_int2.apply(x=self.layer_int1.apply(x=x['int'], update=update), update=update) - x2 = self.layer_float2.apply(x=self.layer_float1.apply(x=x['float'], update=update), update=update) - x3 = self.layer_bounded2.apply(x=self.layer_bounded1.apply(x=x['bounded'], update=update), update=update) - x = x0 * x1 * x2 * x3 - return (x, dict()) if return_internals else x - - environment = MinimalTest( - specification={'bool': (), 'int': (2,), 'float': (1, 1), 'bounded': (1,)} - ) - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - baseline_mode='states', - baseline=dict( - type='aggregated', - baselines={ - 'bool': dict( - type='mlp', - sizes=[32, 32] - ), - 'int': dict( - type='mlp', - sizes=[32, 32] - ), - 'float': dict( - type='mlp', - sizes=[32, 32] - ), - 'bounded': dict( - type='mlp', - sizes=[32, 32] - ) - } - ), - baseline_optimizer=dict( - type='multi_step', - optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - num_steps=5 - ) - ) - - self.base_test_pass( - name='multi-baseline', - environment=environment, - network=CustomNetwork, - **config - ) diff --git a/tensorforce/tests/test_vpg_memories.py b/tensorforce/tests/test_vpg_memories.py deleted file mode 100755 index 24149a9f5..000000000 --- a/tensorforce/tests/test_vpg_memories.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_test import BaseTest -from tensorforce.agents import VPGAgent -from tensorforce.environments import MinimalTest - - -class TestVPGMemories(BaseTest, unittest.TestCase): - - agent = VPGAgent - - def test_latest_timesteps(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='timesteps', - batch_size=8, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - self.base_test_run( - name='latest-timesteps', - environment=environment, - network=network, - **config - ) - - def test_latest_episodes(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - self.base_test_run( - name='latest-episodes', - environment=environment, - network=network, - **config - ) - - def test_latest_sequences(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='sequences', - batch_size=8, - frequency=4, - length=2 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - self.base_test_run( - name='latest-sequences', - environment=environment, - network=network, - **config - ) - - def test_replay_timesteps(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='timesteps', - batch_size=8, - frequency=4 - ), - memory=dict( - type='replay', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - self.base_test_run( - name='replay-timesteps', - environment=environment, - network=network, - **config - ) - - def test_replay_episodes(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='replay', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - self.base_test_run( - name='replay-episodes', - environment=environment, - network=network, - **config - ) - - def test_replay_sequences(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='sequences', - batch_size=8, - frequency=4, - length=2 - ), - memory=dict( - type='replay', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - self.base_test_run( - name='replay-sequences', - environment=environment, - network=network, - 
**config - ) - - def test_prioritized_replay_timesteps(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - - config = dict( - update_mode=dict( - unit='timesteps', - batch_size=8, - frequency=4 - ), - memory=dict( - type='prioritized_replay', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - - self.base_test_run( - name='prioritized-replay-timesteps', - environment=environment, - network=network, - **config - ) diff --git a/tensorforce/tests/test_vpg_multithreaded.py b/tensorforce/tests/test_vpg_multithreaded.py deleted file mode 100755 index 8129a8f22..000000000 --- a/tensorforce/tests/test_vpg_multithreaded.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import copy -import logging -import sys -import unittest - -from tensorforce.agents import VPGAgent -from tensorforce.environments import MinimalTest -from tensorforce.execution import ThreadedRunner -from tensorforce.execution.threaded_runner import clone_worker_agent - - -logging.getLogger('tensorflow').disabled = True - - -class TestVPGMultithreaded(unittest.TestCase): - - def test_multithreaded(self): - sys.stdout.write('\nVPGAgent (multithreaded):') - sys.stdout.flush() - - environment = MinimalTest(specification={'int': ()}) - - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - kwargs = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - agent = VPGAgent( - states=environment.states, - actions=environment.actions, - network=network, - **kwargs - ) - - agents = clone_worker_agent(agent, 5, environment, network, kwargs) - environments = [environment] + [copy.deepcopy(environment) for n in range(4)] - - runner = ThreadedRunner(agent=agents, environment=environments) - - runner.run(episodes=100) - runner.close() - - sys.stdout.write(' ran\n') - sys.stdout.flush() diff --git a/tensorforce/tests/test_vpg_optimizers.py b/tensorforce/tests/test_vpg_optimizers.py deleted file mode 100755 index d5081a481..000000000 --- a/tensorforce/tests/test_vpg_optimizers.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import unittest - -from tensorforce.tests.base_test import BaseTest -from tensorforce.agents import VPGAgent -from tensorforce.environments import MinimalTest - - -class TestVPGOptimizers(BaseTest, unittest.TestCase): - - agent = VPGAgent - - # TODO: Tests for other TensorFlow optimizers, necessary? - - def test_adam(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - self.base_test_pass(name='adam', environment=environment, network=network, **config) - - def test_evolutionary(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='evolutionary', - learning_rate=1e-2 - ) - ) - self.base_test_pass(name='evolutionary', environment=environment, network=network, **config) - - def test_natural_gradient(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='natural_gradient', - learning_rate=1e-3 - ) - ) - self.base_test_pass(name='natural-gradient', environment=environment, network=network, **config) - - def test_clipped_step(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='clipped_step', - optimizer=dict( - type='adam', - learning_rate=1e-2 - ), - clipping_value=0.01 - ) - ) - self.base_test_pass(name='clipped-step', environment=environment, network=network, **config) - - def test_multi_step(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='multi_step', - optimizer=dict( - type='adam', - learning_rate=1e-3 - ) - ) - ) - self.base_test_pass(name='multi-step', environment=environment, network=network, **config) - - def test_optimized_step(self): - 
environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='optimized_step', - optimizer=dict( - type='adam', - learning_rate=1e-2 - ) - ) - ) - self.base_test_pass(name='optimized-step', environment=environment, network=network, **config) - - def test_subsampling_step(self): - environment = MinimalTest(specification={'int': ()}) - network = [ - dict(type='dense', size=32), - dict(type='dense', size=32) - ] - config = dict( - update_mode=dict( - unit='episodes', - batch_size=4, - frequency=4 - ), - memory=dict( - type='latest', - include_next_states=False, - capacity=100 - ), - optimizer=dict( - type='subsampling_step', - optimizer=dict( - type='adam', - learning_rate=1e-3 - ), - fraction=0.33 - ) - ) - self.base_test_pass(name='multi-step', environment=environment, network=network, **config) diff --git a/tensorforce/util.py b/tensorforce/util.py index cec85cef8..f60bcca3a 100755 --- a/tensorforce/util.py +++ b/tensorforce/util.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,14 @@ # limitations under the License. # ============================================================================== +from datetime import datetime import importlib import logging + import numpy as np import tensorflow as tf -from tensorflow.core.util.event_pb2 import SessionLog -from tensorforce import TensorForceError +from tensorforce import TensorforceError epsilon = 1e-6 @@ -34,183 +35,187 @@ ) -def prod(xs): - """Computes the product along the elements in an iterable. Returns 1 for empty iterable. +class NullContext(object): - Args: - xs: Iterable containing numbers. + singleton = None - Returns: Product along iterable. + def __new__(cls): + if cls.singleton is None: + cls.singleton = super().__new__(cls) + return cls.singleton - """ - p = 1 - for x in xs: - p *= x - return p + def __enter__(self): + return self + def __exit__(self, etype, exception, traceback): + pass -def rank(x): - return x.get_shape().ndims + def __getattr__(self, name): + raise AttributeError + def __setattr__(self, name, value): + raise NotImplementedError -def shape(x, unknown=-1): - return tuple(unknown if dims is None else dims for dims in x.get_shape().as_list()) + def __delattr__(self, name): + raise NotImplementedError -def np_dtype(dtype): - """Translates dtype specifications in configurations to numpy data types. - Args: - dtype: String describing a numerical type (e.g. 'float') or numerical type primitive. 
+def debug(message): + logging.warning('{}: {}'.format(datetime.now().strftime('%H:%M:%S-%f')[:-3], message)) - Returns: Numpy data type - """ - if dtype == 'float' or dtype == float or dtype == np.float32 or dtype == tf.float32: - return np.float32 - elif dtype == 'int' or dtype == int or dtype == np.int32 or dtype == tf.int32: - return np.int32 - elif dtype == 'bool' or dtype == bool or dtype == np.bool_ or dtype == tf.bool: - return np.bool_ - else: - raise TensorForceError("Error: Type conversion from type {} not supported.".format(str(dtype))) +def overwrite_staticmethod(obj, function): + qualname = getattr(obj, function).__qualname__ + + def overwritten(*args, **kwargs): + raise TensorforceError(message="Function {}() is a static method.".format(qualname)) + setattr(obj, function, overwritten) -def tf_dtype(dtype): - """Translates dtype specifications in configurations to tensorflow data types. - Args: - dtype: String describing a numerical type (e.g. 'float'), numpy data type, - or numerical type primitive. +def try_import_module(*, module, parent_class=None): + try: + module = importlib.import_module(name=module) + assert parent_class is not None + classes = list() + for cls in dir(module): + cls = getattr(module, cls) + if isinstance(cls, type) and issubclass(cls, parent_class): + classes.append(cls) + if len(classes) > 1: + filter_classes = list() + for cls in classes: + if not all(issubclass(x, cls) for x in classes): # check whether not super-class + filter_classes.append(cls) + classes = filter_classes + if len(classes) == 0: + return None + elif len(classes) > 1: + raise TensorforceError(message="Ambiguous import modules: {}".format( + ', '.join(str(cls) for cls in classes) + )) + cls = classes[0] + if isinstance(parent_class, tuple): + assert all(cls != parent_cls for parent_cls in parent_class) + else: + assert cls != parent_class + return cls - Returns: TensorFlow data type + except ImportError: + pass - """ - if dtype == 'float' or dtype == float or dtype == np.float32 or dtype == tf.float32: - return tf.float32 - elif dtype == 'int' or dtype == int or dtype == np.int32 or dtype == tf.int32: - return tf.int32 - elif dtype == 'bool' or dtype == bool or dtype == np.bool_ or dtype == tf.bool: - return tf.bool - else: - raise TensorForceError("Error: Type conversion from type {} not supported.".format(str(dtype))) + if '.' not in module: + return None + module, class_name = module.rsplit('.', 1) + try: + module = importlib.import_module(name=module) + cls = getattr(module, class_name) + assert issubclass(cls, parent_class) and cls != parent_class + return cls -def map_tensors(fn, tensors): - if tensors is None: + except ImportError: return None - elif isinstance(tensors, tuple): - return tuple(map_tensors(fn=fn, tensors=tensor) for tensor in tensors) - elif isinstance(tensors, list): - return [map_tensors(fn=fn, tensors=tensor) for tensor in tensors] - elif isinstance(tensors, dict): - return {key: map_tensors(fn=fn, tensors=tensor) for key, tensor in tensors.items()} - elif isinstance(tensors, set): - return {map_tensors(fn=fn, tensors=tensor) for tensor in tensors} - else: - return fn(tensors) -def get_object(obj, predefined_objects=None, default_object=None, kwargs=None): - """ - Utility method to map some kind of object specification to its content, - e.g. optimizer or baseline specifications to the respective classes. 
+def is_iterable(x): + if isinstance(x, (str, dict, np.ndarray, tf.Tensor)): + return False + try: + iter(x) + return True + except TypeError: + return False + + +def is_equal(x, y): + if isinstance(x, tuple): + return isinstance(y, tuple) and all(is_equal(x=x, y=y) for x, y in zip(x, y)) + elif isinstance(x, (list, tuple)): + return isinstance(y, list) and all(is_equal(x=x, y=y) for x, y in zip(x, y)) + elif isinstance(x, dict): + return isinstance(y, dict) and len(x) == len(y) and \ + all(k in y and is_equal(x=v, y=y[k]) for k, v in x.items()) + elif isinstance(x, np.ndarray): + return isinstance(y, np.ndarray) and (x == y).all() + else: + return x == y - Args: - obj: A specification dict (value for key 'type' optionally specifies - the object, options as follows), a module path (e.g., - my_module.MyClass), a key in predefined_objects, or a callable - (e.g., the class type object). - predefined_objects: Dict containing predefined set of objects, - accessible via their key - default_object: Default object is no other is specified - kwargs: Arguments for object creation - Returns: The retrieved object +def unary_tuple(x, depth): + assert depth > 0 + for _ in range(depth): + x = (x,) + return x - """ - args = () - kwargs = dict() if kwargs is None else kwargs - - if isinstance(obj, dict): - kwargs.update(obj) - obj = kwargs.pop('type', None) - - if predefined_objects is not None and obj in predefined_objects: - obj = predefined_objects[obj] - elif isinstance(obj, str): - if obj.find('.') != -1: - module_name, function_name = obj.rsplit('.', 1) - module = importlib.import_module(module_name) - obj = getattr(module, function_name) + +def product(xs, empty=1): + result = None + for x in xs: + if result is None: + result = x else: - raise TensorForceError("Error: object {} not found in predefined objects: {}".format( - obj, - list(predefined_objects or ()) - )) - elif callable(obj): - pass - elif default_object is not None: - args = (obj,) - obj = default_object + result *= x + + if result is None: + result = empty + + return result + + +def deep_disjoint_update(target, source): # , ignore=() + for key, value in source.items(): + if key not in target: + target[key] = value + # elif key in ignore: + # continue + elif isinstance(target[key], dict): + if not isinstance(value, dict): + raise TensorforceError.mismatch( + name='spec', argument=key, value1=target[key], value2=value + ) + deep_disjoint_update(target=target[key], source=value) + elif is_iterable(x=target[key]): + if not is_iterable(x=value) or len(target[key]) != len(value): + raise TensorforceError.mismatch( + name='spec', argument=key, value1=target[key], value2=value + ) + for x, y in zip(target[key], value): + if x != y: + raise TensorforceError.mismatch( + name='spec', argument=key, value1=target[key], value2=value + ) + elif target[key] != value: + raise TensorforceError.mismatch( + name='spec', argument=key, value1=target[key], value2=value + ) + + +def py_dtype(dtype): + if dtype == 'float': # or dtype == float or dtype == np.float32 or dtype == tf.float32: + return float + elif dtype == 'int' or dtype == 'long': + # dtype == int or dtype == np.int32 or dtype == tf.int32 or + # or dtype == np.int64 or dtype == tf.int64 + return int + elif dtype == 'bool': # or dtype == bool or dtype == np.bool8 or dtype == tf.bool: + return bool else: - # assumes the object is already instantiated - return obj + raise TensorforceError.value(name='util.py_dtype', argument='dtype', value=dtype) - return obj(*args, **kwargs) -def 
prepare_kwargs(raw, string_parameter='name'): - """ - Utility method to convert raw string/diction input into a dictionary to pass - into a function. Always returns a dictionary. +np_dtype_mapping = dict(bool=np.bool8, int=np.int64, long=np.int64, float=np.float32) + +def np_dtype(dtype): + """Translates dtype specifications in configurations to numpy data types. Args: - raw: string or dictionary, string is assumed to be the name of the activation - activation function. Dictionary will be passed through unchanged. + dtype: String describing a numerical type (e.g. 'float') or numerical type primitive. - Returns: kwargs dictionary for **kwargs + Returns: Numpy data type """ - kwargs = dict() - - if isinstance(raw, dict): - kwargs.update(raw) - elif isinstance(raw, str): - kwargs[string_parameter] = raw - - return kwargs - -class UpdateSummarySaverHook(tf.train.SummarySaverHook): - - def __init__(self, model, *args, **kwargs): - super(UpdateSummarySaverHook, self).__init__(*args, **kwargs) - self.model = model - - def before_run(self, run_context): - self._request_summary = run_context.original_args[1] is not None and \ - self.model.is_observe and \ - (self._next_step is None or self._timer.should_trigger_for_step(self._next_step)) - # run_context.original_args[1].get(self.is_optimizing, False) and \ - requests = {'global_step': self._global_step_tensor} - if self._request_summary: - if self._get_summary_op() is not None: - requests['summary'] = self._get_summary_op() - return tf.train.SessionRunArgs(requests) - - def after_run(self, run_context, run_values): - if not self._summary_writer: - return - - stale_global_step = run_values.results['global_step'] - global_step = stale_global_step + 1 - if self._next_step is None or self._request_summary: - global_step = run_context.session.run(self._global_step_tensor) - - if self._next_step is None: - self._summary_writer.add_session_log(SessionLog(status=SessionLog.START), global_step) - - if 'summary' in run_values.results: - self._timer.update_last_triggered_step(global_step) - for summary in run_values.results['summary']: - self._summary_writer.add_summary(summary, global_step) - - self._next_step = global_step + 1 + if dtype in np_dtype_mapping: + return np_dtype_mapping[dtype] + else: + raise TensorforceError.value(name='util.np_dtype', argument='dtype', value=dtype) diff --git a/tensorforce/tests/__init__.py b/test/__init__.py similarity index 85% rename from tensorforce/tests/__init__.py rename to test/__init__.py index c742eb977..f123484b1 100644 --- a/tensorforce/tests/__init__.py +++ b/test/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017 reinforce.io. All Rights Reserved. +# Copyright 2020 Tensorforce Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,3 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - - -# Pass thresholds for all tests -reward_threshold = 0.8 diff --git a/test/data/Breakout.bin b/test/data/Breakout.bin new file mode 100644 index 000000000..abab5a8c0 Binary files /dev/null and b/test/data/Breakout.bin differ diff --git a/test/data/agent.json b/test/data/agent.json new file mode 100644 index 000000000..02d7020ae --- /dev/null +++ b/test/data/agent.json @@ -0,0 +1,12 @@ +{ + "agent": "tensorforce", + "update": 4, + "optimizer": { + "optimizer": "adam", + "learning_rate": 1e-3 + }, + "objective": "policy_gradient", + "reward_estimation": { + "horizon": 2 + } +} diff --git a/test/data/basic.cfg b/test/data/basic.cfg new file mode 100644 index 000000000..d93bce96e --- /dev/null +++ b/test/data/basic.cfg @@ -0,0 +1,39 @@ +# Lines starting with # are treated as comments (or with whitespaces+#). +# It doesn't matter if you use capital letters or not. +# It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. + +doom_scenario_path = basic.wad +doom_map = map01 + +# Rewards +living_reward = -1 + +# Rendering options +screen_resolution = RES_320X240 +screen_format = CRCGCB +render_hud = True +render_crosshair = false +render_weapon = true +render_decals = false +render_particles = false +window_visible = true + +# make episodes start after 20 tics (after unholstering the gun) +episode_start_time = 14 + +# make episodes finish after 300 actions (tics) +episode_timeout = 300 + +# Available buttons +available_buttons = + { + MOVE_LEFT + MOVE_RIGHT + ATTACK + } + +# Game variables that will be in the state +available_game_variables = { AMMO2} + +mode = PLAYER +doom_skill = 5 diff --git a/test/data/basic.wad b/test/data/basic.wad new file mode 100644 index 000000000..51e937e25 Binary files /dev/null and b/test/data/basic.wad differ diff --git a/test/data/checkpoint b/test/data/checkpoint new file mode 100644 index 000000000..6c64444b1 --- /dev/null +++ b/test/data/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "ppo-checkpoint-1" +all_model_checkpoint_paths: "ppo-checkpoint-1" diff --git a/test/data/custom_env.py b/test/data/custom_env.py new file mode 100644 index 000000000..6926dc8dd --- /dev/null +++ b/test/data/custom_env.py @@ -0,0 +1,33 @@ +import numpy as np + +from tensorforce import Environment + + +class CustomEnvironment(Environment): + + def __init__(self): + super().__init__() + + def states(self): + return dict(type='float', shape=(8,)) + + def actions(self): + return dict(type='int', num_values=4) + + # Optional, should only be defined if environment has a natural maximum duration + def max_episode_timesteps(self): + return super().max_episode_timesteps() + + # Optional + def close(self): + super().close() + + def reset(self): + state = np.random.random(size=(8,)) + return state + + def execute(self, actions): + next_state = np.random.random(size=(8,)) + terminal = False # Always False if no "natural" terminal state + reward = np.random.random() + return next_state, terminal, reward diff --git a/test/data/environment.json b/test/data/environment.json new file mode 100644 index 000000000..3ea8d6d6f --- /dev/null +++ b/test/data/environment.json @@ -0,0 +1,6 @@ +{ + "environment": "gym", + "level": "CartPole", + "min_value": -3.0, + "max_value": 3.0 +} diff --git a/test/data/memory.json b/test/data/memory.json new file mode 100644 index 000000000..05d7621b3 --- /dev/null +++ b/test/data/memory.json @@ -0,0 +1,3 @@ +{ + "type": 
"replay" +} diff --git a/test/data/network.json b/test/data/network.json new file mode 100644 index 000000000..7c6a61fbd --- /dev/null +++ b/test/data/network.json @@ -0,0 +1,13 @@ +{ + "type": "layered", + "layers": [ + { + "type": "dense", + "size": 32 + }, + { + "type": "dense", + "size": 32 + } + ] +} diff --git a/test/data/ppo-checkpoint-1.data-00000-of-00001 b/test/data/ppo-checkpoint-1.data-00000-of-00001 new file mode 100644 index 000000000..d25c89e48 Binary files /dev/null and b/test/data/ppo-checkpoint-1.data-00000-of-00001 differ diff --git a/test/data/ppo-checkpoint-1.index b/test/data/ppo-checkpoint-1.index new file mode 100644 index 000000000..87d449122 Binary files /dev/null and b/test/data/ppo-checkpoint-1.index differ diff --git a/test/data/ppo-checkpoint.hdf5 b/test/data/ppo-checkpoint.hdf5 new file mode 100644 index 000000000..514fe0aa0 Binary files /dev/null and b/test/data/ppo-checkpoint.hdf5 differ diff --git a/test/data/ppo-checkpoint.json b/test/data/ppo-checkpoint.json new file mode 100644 index 000000000..415554651 --- /dev/null +++ b/test/data/ppo-checkpoint.json @@ -0,0 +1 @@ +{"agent": "ppo", "states": {"type": "float", "shape": [4], "min_value": [-4.800000190734863, -3.0, -0.41887903213500977, -3.0], "max_value": [4.800000190734863, 3.0, 0.41887903213500977, 3.0]}, "actions": {"type": "int", "shape": [], "num_values": 2}, "max_episode_timesteps": 500, "batch_size": 12, "network": {"type": "auto", "rnn": false}, "use_beta_distribution": false, "memory": "minimum", "update_frequency": 1, "learning_rate": 0.001813150053725916, "multi_step": 5, "subsampling_fraction": 0.9131375430837279, "likelihood_ratio_clipping": 0.09955676846552193, "discount": 0.9985351346308641, "predict_terminal_values": false, "reward_processing": null, "baseline": {"type": "auto", "rnn": false}, "baseline_optimizer": {"optimizer": "adam", "learning_rate": 0.003670157218888348, "multi_step": 10}, "state_preprocessing": "linear_normalization", "exploration": 0.0, "variable_noise": 0.0, "l2_regularization": 0.0, "entropy_regularization": 0.0011393096635237982, "parallel_interactions": 1, "config": {"device": "CPU"}, "saver": null, "summarizer": null, "recorder": {"directory": "test/data/ppo-traces", "start": 80}, "internals": {}, "initial_internals": {"policy": {}, "baseline": {}}} diff --git a/test/data/ppo-checkpoint.npz b/test/data/ppo-checkpoint.npz new file mode 100644 index 000000000..5fd3dcf40 Binary files /dev/null and b/test/data/ppo-checkpoint.npz differ diff --git a/test/data/ppo-checkpoint/saved_model.pb b/test/data/ppo-checkpoint/saved_model.pb new file mode 100644 index 000000000..afa84f404 Binary files /dev/null and b/test/data/ppo-checkpoint/saved_model.pb differ diff --git a/test/data/ppo-checkpoint/variables/variables.data-00000-of-00001 b/test/data/ppo-checkpoint/variables/variables.data-00000-of-00001 new file mode 100644 index 000000000..e0580d5ac Binary files /dev/null and b/test/data/ppo-checkpoint/variables/variables.data-00000-of-00001 differ diff --git a/test/data/ppo-checkpoint/variables/variables.index b/test/data/ppo-checkpoint/variables/variables.index new file mode 100644 index 000000000..2a033a727 Binary files /dev/null and b/test/data/ppo-checkpoint/variables/variables.index differ diff --git a/test/data/ppo-traces/trace-000000080.npz b/test/data/ppo-traces/trace-000000080.npz new file mode 100644 index 000000000..3f5b64c8c Binary files /dev/null and b/test/data/ppo-traces/trace-000000080.npz differ diff --git a/test/data/ppo-traces/trace-000000081.npz 
b/test/data/ppo-traces/trace-000000081.npz new file mode 100644 index 000000000..a2673d2ac Binary files /dev/null and b/test/data/ppo-traces/trace-000000081.npz differ diff --git a/test/data/ppo-traces/trace-000000082.npz b/test/data/ppo-traces/trace-000000082.npz new file mode 100644 index 000000000..e33c9f035 Binary files /dev/null and b/test/data/ppo-traces/trace-000000082.npz differ diff --git a/test/data/ppo-traces/trace-000000083.npz b/test/data/ppo-traces/trace-000000083.npz new file mode 100644 index 000000000..7cf903a95 Binary files /dev/null and b/test/data/ppo-traces/trace-000000083.npz differ diff --git a/test/data/ppo-traces/trace-000000084.npz b/test/data/ppo-traces/trace-000000084.npz new file mode 100644 index 000000000..b264196b1 Binary files /dev/null and b/test/data/ppo-traces/trace-000000084.npz differ diff --git a/test/data/ppo-traces/trace-000000085.npz b/test/data/ppo-traces/trace-000000085.npz new file mode 100644 index 000000000..bdbeda361 Binary files /dev/null and b/test/data/ppo-traces/trace-000000085.npz differ diff --git a/test/data/ppo-traces/trace-000000086.npz b/test/data/ppo-traces/trace-000000086.npz new file mode 100644 index 000000000..5e1103e9a Binary files /dev/null and b/test/data/ppo-traces/trace-000000086.npz differ diff --git a/test/data/ppo-traces/trace-000000087.npz b/test/data/ppo-traces/trace-000000087.npz new file mode 100644 index 000000000..235fa09e5 Binary files /dev/null and b/test/data/ppo-traces/trace-000000087.npz differ diff --git a/test/data/ppo-traces/trace-000000088.npz b/test/data/ppo-traces/trace-000000088.npz new file mode 100644 index 000000000..885090ddf Binary files /dev/null and b/test/data/ppo-traces/trace-000000088.npz differ diff --git a/test/data/ppo-traces/trace-000000089.npz b/test/data/ppo-traces/trace-000000089.npz new file mode 100644 index 000000000..80fa87c44 Binary files /dev/null and b/test/data/ppo-traces/trace-000000089.npz differ diff --git a/test/data/ppo-traces/trace-000000090.npz b/test/data/ppo-traces/trace-000000090.npz new file mode 100644 index 000000000..1feaeed15 Binary files /dev/null and b/test/data/ppo-traces/trace-000000090.npz differ diff --git a/test/data/ppo-traces/trace-000000091.npz b/test/data/ppo-traces/trace-000000091.npz new file mode 100644 index 000000000..77ebe3bdb Binary files /dev/null and b/test/data/ppo-traces/trace-000000091.npz differ diff --git a/test/data/ppo-traces/trace-000000092.npz b/test/data/ppo-traces/trace-000000092.npz new file mode 100644 index 000000000..a7520136a Binary files /dev/null and b/test/data/ppo-traces/trace-000000092.npz differ diff --git a/test/data/ppo-traces/trace-000000093.npz b/test/data/ppo-traces/trace-000000093.npz new file mode 100644 index 000000000..352e8da2f Binary files /dev/null and b/test/data/ppo-traces/trace-000000093.npz differ diff --git a/test/data/ppo-traces/trace-000000094.npz b/test/data/ppo-traces/trace-000000094.npz new file mode 100644 index 000000000..b4a5ad92a Binary files /dev/null and b/test/data/ppo-traces/trace-000000094.npz differ diff --git a/test/data/ppo-traces/trace-000000095.npz b/test/data/ppo-traces/trace-000000095.npz new file mode 100644 index 000000000..ea5a49adb Binary files /dev/null and b/test/data/ppo-traces/trace-000000095.npz differ diff --git a/test/data/ppo-traces/trace-000000096.npz b/test/data/ppo-traces/trace-000000096.npz new file mode 100644 index 000000000..13a87d02f Binary files /dev/null and b/test/data/ppo-traces/trace-000000096.npz differ diff --git a/test/data/ppo-traces/trace-000000097.npz 
b/test/data/ppo-traces/trace-000000097.npz new file mode 100644 index 000000000..590f287e7 Binary files /dev/null and b/test/data/ppo-traces/trace-000000097.npz differ diff --git a/test/data/ppo-traces/trace-000000098.npz b/test/data/ppo-traces/trace-000000098.npz new file mode 100644 index 000000000..c22664330 Binary files /dev/null and b/test/data/ppo-traces/trace-000000098.npz differ diff --git a/test/data/ppo-traces/trace-000000099.npz b/test/data/ppo-traces/trace-000000099.npz new file mode 100644 index 000000000..7e59b92a9 Binary files /dev/null and b/test/data/ppo-traces/trace-000000099.npz differ diff --git a/test/data/ppo_checkpoint.py b/test/data/ppo_checkpoint.py new file mode 100644 index 000000000..615f71ae3 --- /dev/null +++ b/test/data/ppo_checkpoint.py @@ -0,0 +1,33 @@ +import os + +from tensorforce import Runner + + +os.remove('test/data/checkpoint') +os.remove('test/data/ppo-checkpoint-1.data-00000-of-00001') +os.remove('test/data/ppo-checkpoint-1.index') +os.remove('test/data/ppo-checkpoint.json') +os.remove('test/data/ppo-checkpoint.npz') +os.remove('test/data/ppo-checkpoint.hdf5') + +os.rmdir('test/data/ppo-checkpoint/assets') +os.remove('test/data/ppo-checkpoint/variables/variables.data-00000-of-00001') +os.remove('test/data/ppo-checkpoint/variables/variables.index') +os.rmdir('test/data/ppo-checkpoint/variables') +os.remove('test/data/ppo-checkpoint/saved_model.pb') +os.rmdir('test/data/ppo-checkpoint') + + +runner = Runner( + agent=dict( + agent='benchmarks/configs/ppo.json', + config=dict(device='CPU'), + recorder=dict(directory='test/data/ppo-traces', start=80) + ), environment='benchmarks/configs/cartpole.json' +) +runner.run(num_episodes=100) +runner.agent.save(directory='test/data', filename='ppo-checkpoint', format='checkpoint') +runner.agent.save(directory='test/data', filename='ppo-checkpoint', format='numpy') +runner.agent.save(directory='test/data', filename='ppo-checkpoint', format='hdf5') +runner.agent.save(directory='test/data', filename='ppo-checkpoint', format='saved-model') +runner.close() diff --git a/test/test_agents.py b/test/test_agents.py new file mode 100644 index 000000000..83d9e6820 --- /dev/null +++ b/test/test_agents.py @@ -0,0 +1,226 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from tempfile import TemporaryDirectory +import unittest + +from tensorforce import Agent +from test.unittest_base import UnittestBase + + +class TestAgents(UnittestBase, unittest.TestCase): + + agent = dict( + config=dict(device='CPU', eager_mode=False, create_debug_assertions=True, tf_log_level=20) + ) + + def test_a2c(self): + self.start_tests(name='A2C') + # TODO: baseline horizon has to be equal to policy horizon + agent, environment = self.prepare( + agent='a2c', batch_size=4, network=dict(type='auto', size=8, depth=1, rnn=2), + critic=dict(type='auto', size=7, depth=1, rnn=2) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() + + def test_ac(self): + self.start_tests(name='AC') + # TODO: baseline horizon has to be equal to policy horizon + agent, environment = self.prepare( + agent='ac', batch_size=4, network=dict(type='auto', size=8, depth=1, rnn=2), + critic=dict(type='auto', size=7, depth=1, rnn=2) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() + + def test_constant(self): + self.start_tests(name='Constant') + self.unittest(num_episodes=2, experience_update=False, agent='constant') + + def test_dpg(self): + self.start_tests(name='DPG') + actions = dict( + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0) + ) + agent, environment = self.prepare( + actions=actions, agent='dpg', memory=100, batch_size=4, + # TODO: no-RNN restriction can be removed + network=dict(type='auto', size=8, depth=1, rnn=False), + # TODO: cannot use RNN since value function takes states and actions + critic=dict(type='auto', size=7, depth=1, rnn=False) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() + + def test_double_dqn(self): + self.start_tests(name='DoubleDQN') + agent, environment = self.prepare( + actions=dict(type='int', shape=(2,), num_values=4), + agent='double_dqn', memory=100, batch_size=4, + network=dict(type='auto', size=8, depth=1, rnn=2) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() + + def test_dqn(self): + self.start_tests(name='DQN') + agent, environment = self.prepare( + actions=dict(type='int', shape=(2,), num_values=4), + agent='dqn', memory=100, batch_size=4, + network=dict(type='auto', size=8, depth=1, rnn=2) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + 
agent.act(states=states) + agent.close() + environment.close() + + def test_dueling_dqn(self): + self.start_tests(name='DuelingDQN') + agent, environment = self.prepare( + actions=dict(type='int', shape=(2,), num_values=4), + agent='dueling_dqn', memory=100, batch_size=4, + network=dict(type='auto', size=8, depth=1, rnn=2) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() + + def test_ppo(self): + self.start_tests(name='PPO') + agent, environment = self.prepare( + agent='ppo', batch_size=2, network=dict(type='auto', size=8, depth=1, rnn=2), + baseline=dict(type='auto', size=7, depth=1, rnn=1), + baseline_optimizer=dict(optimizer='adam', learning_rate=1e-3) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() + + def test_random(self): + self.start_tests(name='Random') + self.unittest(num_episodes=2, experience_update=False, agent='random') + + def test_tensorforce(self): + self.start_tests(name='Tensorforce') + + # Explicit, singleton state/action + self.unittest( + states=dict(type='float', shape=(), min_value=1.0, max_value=2.0), + actions=dict(type='int', shape=(), num_values=4), + agent='tensorforce', **UnittestBase.agent + ) + + # Implicit + agent, environment = self.prepare(**UnittestBase.agent) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() + + def test_trpo(self): + self.start_tests(name='TRPO') + agent, environment = self.prepare( + agent='trpo', batch_size=2, network=dict(type='auto', size=8, depth=1, rnn=2), + baseline=dict(type='auto', size=7, depth=1, rnn=1), + baseline_optimizer=dict(optimizer='adam', learning_rate=1e-3) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() + + def test_vpg(self): + self.start_tests(name='VPG') + agent, environment = self.prepare( + agent='vpg', batch_size=2, network=dict(type='auto', size=8, depth=1, rnn=2), + baseline=dict(type='auto', size=7, depth=1, rnn=1), + baseline_optimizer=dict(optimizer='adam', learning_rate=1e-3) + ) + + self.execute(agent=agent, environment=environment) + + with TemporaryDirectory() as directory: + agent.save(directory=directory, format='numpy') + agent = Agent.load(directory=directory) + states = environment.reset() + agent.act(states=states) + agent.close() + environment.close() diff --git a/test/test_documentation.py b/test/test_documentation.py new file mode 100644 index 000000000..64b5d6a0f --- /dev/null +++ b/test/test_documentation.py @@ -0,0 +1,370 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest + +from tensorforce import Agent, Environment, Runner + +from test.unittest_base import UnittestBase + + +class TestDocumentation(UnittestBase, unittest.TestCase): + + def test_environment(self): + self.start_tests(name='getting-started-environment') + + environment = Environment.create( + environment='gym', level='CartPole', max_episode_timesteps=50 + ) + self.finished_test() + + environment = Environment.create(environment='gym', level='CartPole-v1') + self.finished_test() + + environment = Environment.create( + environment='test/data/environment.json', max_episode_timesteps=50 + ) + self.finished_test() + + environment = Environment.create( + environment='test.data.custom_env', max_episode_timesteps=10 + ) + self.finished_test() + + from test.data.custom_env import CustomEnvironment + environment = Environment.create( + environment=CustomEnvironment, max_episode_timesteps=10 + ) + self.finished_test() + + def test_agent(self): + self.start_tests(name='getting-started-agent') + + environment = Environment.create( + environment='gym', level='CartPole', max_episode_timesteps=50 + ) + self.finished_test() + + agent = Agent.create( + agent='tensorforce', environment=environment, update=64, + optimizer=dict(optimizer='adam', learning_rate=1e-3), + objective='policy_gradient', reward_estimation=dict(horizon=20) + ) + self.finished_test() + + agent = Agent.create( + agent='ppo', environment=environment, batch_size=10, learning_rate=1e-3 + ) + self.finished_test() + + agent = Agent.create(agent='test/data/agent.json', environment=environment) + self.finished_test() + + def test_execution(self): + self.start_tests(name='getting-started-execution') + + runner = Runner( + agent='test/data/agent.json', environment=dict(environment='gym', level='CartPole'), + max_episode_timesteps=10 + ) + runner.run(num_episodes=10) + runner.run(num_episodes=5, evaluation=True) + runner.close() + self.finished_test() + + runner = Runner( + agent='test/data/agent.json', environment=dict(environment='gym', level='CartPole'), + max_episode_timesteps=50, num_parallel=5, remote='multiprocessing' + ) + runner.run(num_episodes=10) + runner.close() + self.finished_test() + + # Create agent and environment + environment = Environment.create( + environment='test/data/environment.json', max_episode_timesteps=10 + ) + agent = Agent.create(agent='test/data/agent.json', environment=environment) + + # Train for 100 episodes + for _ in range(10): + states = environment.reset() + terminal = False + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + + # Train for 100 episodes + for _ in range(10): + episode_states = list() + episode_internals = list() + episode_actions = list() + episode_terminal = list() + episode_reward = list() + + states = environment.reset() + internals = agent.initial_internals() + terminal = False + while not terminal: + episode_states.append(states) + 
episode_internals.append(internals) + actions, internals = agent.act(states=states, internals=internals, independent=True) + episode_actions.append(actions) + states, terminal, reward = environment.execute(actions=actions) + episode_terminal.append(terminal) + episode_reward.append(reward) + + agent.experience( + states=episode_states, internals=episode_internals, actions=episode_actions, + terminal=episode_terminal, reward=episode_reward + ) + agent.update() + + # Evaluate for 100 episodes + sum_rewards = 0.0 + for _ in range(10): + states = environment.reset() + internals = agent.initial_internals() + terminal = False + while not terminal: + actions, internals = agent.act( + states=states, internals=internals, + deterministic=True, independent=True + ) + states, terminal, reward = environment.execute(actions=actions) + sum_rewards += reward + + print('Mean episode reward:', sum_rewards / 100) + + # Close agent and environment + agent.close() + environment.close() + + self.finished_test() + + def test_readme(self): + self.start_tests(name='readme') + + # ==================== + + from tensorforce import Agent, Environment + + # Pre-defined or custom environment + environment = Environment.create( + environment='gym', level='CartPole', max_episode_timesteps=500 + ) + + # Instantiate a Tensorforce agent + agent = Agent.create( + agent='tensorforce', + environment=environment, # alternatively: states, actions, (max_episode_timesteps) + memory=1000, + update=dict(unit='timesteps', batch_size=64), + optimizer=dict(type='adam', learning_rate=3e-4), + policy=dict(network='auto'), + objective='policy_gradient', + reward_estimation=dict(horizon=20) + ) + + # Train for 300 episodes + for _ in range(1): + + # Initialize episode + states = environment.reset() + terminal = False + + while not terminal: + # Episode timestep + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + + agent.close() + environment.close() + + # ==================== + + self.finished_test() + + def test_modules(self): + self.start_tests(name='modules') + + # distributions + self.unittest( + policy=dict(distributions=dict( + float=dict(type='gaussian', stddev_mode='global'), + bounded_action=dict(type='beta') + )) + ) + + # layers + import tensorflow as tf + self.unittest( + states=dict(type='float', shape=(2,), min_value=-1.0, max_value=2.0), + policy=dict(network=[ + (lambda x: tf.clip_by_value(x, -1.0, 1.0)), + dict(type='dense', size=8, activation='tanh') + ]) + ) + + # memories + self.unittest( + memory=100 + ) + + # networks + self.unittest( + states=dict(type='float', shape=(2,), min_value=1.0, max_value=2.0), + policy=dict(network=[ + dict(type='dense', size=8, activation='tanh'), + dict(type='dense', size=8, activation='tanh') + ]) + ) + self.unittest( + states=dict( + observation=dict(type='float', shape=(4, 4, 3), min_value=-1.0, max_value=1.0), + attributes=dict(type='int', shape=(4, 2), num_values=5) + ), + policy=[ + [ + dict(type='retrieve', tensors=['observation']), + dict(type='conv2d', size=8), + dict(type='flatten'), + dict(type='register', tensor='obs-embedding') + ], + [ + dict(type='retrieve', tensors=['attributes']), + dict(type='embedding', size=8), + dict(type='flatten'), + dict(type='register', tensor='attr-embedding') + ], + [ + dict( + type='retrieve', tensors=['obs-embedding', 'attr-embedding'], + aggregation='concat' + ), + dict(type='dense', size=16) + ] + ] + ) + + # optimizers + self.unittest( + 
optimizer=dict( + optimizer='adam', learning_rate=1e-3, clipping_threshold=1e-2, + multi_step=3, subsampling_fraction=8, linesearch_iterations=3, + doublecheck_update=True + ) + ) + + # parameters + self.unittest( + exploration=0.1 + ) + self.unittest( + optimizer=dict(optimizer='adam', learning_rate=dict( + type='exponential', unit='timesteps', num_steps=2, + initial_value=0.01, decay_rate=0.5 + )) + ) + self.unittest( + reward_estimation=dict(horizon=dict( + type='linear', unit='episodes', num_steps=2, + initial_value=2, final_value=6 + )) + ) + + # preprocessing + reward_processing = dict(type='clipping', lower=-1.0, upper=1.0) + self.unittest( + states=dict(type='float', shape=(8, 8, 3), min_value=-1.0, max_value=2.0), + reward_estimation=dict( + horizon=3, estimate_advantage=True, predict_horizon_values='late', + reward_processing=reward_processing, + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ), + state_preprocessing=[ + dict(type='image', height=4, width=4, grayscale=True), + dict(type='exponential_normalization', decay=0.999) + ] + ) + + # policy + self.unittest( + states=dict(type='float', shape=(2,), min_value=-1.0, max_value=2.0), + policy=[ + dict(type='dense', size=8, activation='tanh'), + dict(type='dense', size=8, activation='tanh') + ] + ) + self.unittest( + states=dict(type='float', shape=(2,), min_value=-1.0, max_value=2.0), + policy=dict(network='auto') + ) + self.unittest( + states=dict(type='float', shape=(2,), min_value=-1.0, max_value=2.0), + policy=dict( + type='parametrized_distributions', + network=[ + dict(type='dense', size=8, activation='tanh'), + dict(type='dense', size=8, activation='tanh') + ], + distributions=dict( + float=dict(type='gaussian', stddev_mode='global'), + bounded_action=dict(type='beta') + ), + temperature=dict( + type='decaying', decay='exponential', unit='episodes', + num_steps=2, initial_value=0.01, decay_rate=0.5 + ) + ) + ) + self.unittest( + states=dict(type='float', shape=(2,), min_value=-1.0, max_value=2.0), + actions=dict( + action1=dict(type='int', shape=(), num_values=5), + action2=dict(type='float', shape=(), min_value=-1.0, max_value=1.0) + ), + policy=dict( + type='parametrized_distributions', + network=[ + dict(type='dense', size=64), + dict(type='register', tensor='action1-embedding'), + dict(type='dense', size=64) + # Final output implicitly used for remaining actions + ], + single_output=False + ) + ) + + def test_masking(self): + self.start_tests(name='masking') + + agent, environment = self.prepare( + states=dict(type='float', shape=(10,), min_value=-1.0, max_value=2.0), + actions=dict(type='int', shape=(), num_values=3) + ) + states = environment.reset() + assert 'state' in states and 'action_mask' in states + states['action_mask'] = [True, False, True] + + action = agent.act(states=states) + assert action != 1 + + agent.close() + environment.close() + self.finished_test() diff --git a/test/test_environments.py b/test/test_environments.py new file mode 100644 index 000000000..c66bd853b --- /dev/null +++ b/test/test_environments.py @@ -0,0 +1,121 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import pytest +import unittest + +from test.unittest_base import UnittestBase + + +class TestEnvironments(UnittestBase, unittest.TestCase): + + agent = dict(agent='random') + experience_update = False + + def test_ale(self): + self.start_tests(name='ale') + self.unittest( + environment=dict(environment='ale', level='test/data/Breakout.bin'), num_episodes=2 + ) + + @pytest.mark.skip(reason='not installed as part of travis') + def test_open_sim(self): + self.start_tests(name='open-sim') + self.unittest(environment=dict(environment='osim', level='Arm2D'), num_episodes=2) + self.unittest(environment=dict(environment='osim', level='L2M2019'), num_episodes=2) + self.unittest(environment=dict(environment='osim', level='LegacyArm'), num_episodes=2) + self.unittest(environment=dict(environment='osim', level='LegacyRun'), num_episodes=2) + + def test_openai_gym(self): + self.start_tests(name='openai-gym') + + # state: box, action: discrete + self.unittest(environment=dict(environment='gym', level='CartPole-v0'), num_episodes=2) + + # state: discrete, action: box + # self.unittest(environment=dict(environment='gym', level='GuessingGame'), num_episodes=2) + + # state: discrete, action: tuple(discrete) + # from gym.envs.algorithmic import ReverseEnv + # self.unittest(environment=ReverseEnv, num_episodes=2) + + # state: discrete, action: discrete + from gym.envs.toy_text import FrozenLakeEnv + self.unittest(environment=FrozenLakeEnv, num_episodes=2) + + # state: tuple, action: discrete + from gym.envs.toy_text import BlackjackEnv + self.unittest(environment=BlackjackEnv(), num_episodes=2) + + # Classic control + self.unittest(environment='CartPole-v1', num_episodes=2) + self.unittest(environment='MountainCar-v0', num_episodes=2) + self.unittest(environment='MountainCarContinuous-v0', num_episodes=2) + self.unittest(environment='Pendulum-v1', num_episodes=2) + self.unittest(environment='Acrobot-v1', num_episodes=2) + + # Box2d + self.unittest(environment='LunarLander-v2', num_episodes=2) + self.unittest(environment='LunarLanderContinuous-v2', num_episodes=2) + self.unittest(environment='BipedalWalker-v3', num_episodes=2) + self.unittest(environment='BipedalWalkerHardcore-v3', num_episodes=2) + # below: self.unittest(environment='CarRacing-v0', num_episodes=2) + + # Toy text + # above: self.unittest(environment='Blackjack-v1', num_episodes=2) + self.unittest(environment='FrozenLake-v1', num_episodes=2) + self.unittest(environment='FrozenLake8x8-v1', num_episodes=2) + self.unittest(environment='CliffWalking-v0', num_episodes=2) + self.unittest(environment='Taxi-v3', num_episodes=2) + + # Unit test + self.unittest(environment='CubeCrash-v0', num_episodes=2) + self.unittest(environment='CubeCrashSparse-v0', num_episodes=2) + self.unittest(environment='CubeCrashScreenBecomesBlack-v0', num_episodes=2) + self.unittest(environment='MemorizeDigits-v0', num_episodes=2) + + @pytest.mark.skip(reason='requires virtual frame buffer xvfb-run') + def test_openai_gym2(self): + # state: box, action: box with non-uniform bounds + # 
xvfb-run -s "-screen 0 1400x900x24" python -m unittest ... + self.unittest(environment='CarRacing-v0', num_episodes=2) + + def test_openai_retro(self): + self.start_tests(name='openai-retro') + self.unittest( + environment=dict(environment='retro', level='Airstriker-Genesis'), num_episodes=2 + ) + + @pytest.mark.skip(reason='not installed as part of travis') + def test_ple(self): + self.start_tests(name='pygame-learning-environment') + self.unittest(environment=dict(environment='ple', level='Catcher'), num_episodes=2) + # Assets missing: + # self.unittest(environment=dict(environment='ple', level='FlappyBird'), num_episodes=2) + # self.unittest(environment=dict(environment='ple', level='MonsterKong'), num_episodes=2) + self.unittest(environment=dict(environment='ple', level='Pixelcopter'), num_episodes=2) + self.unittest(environment=dict(environment='ple', level='Pong'), num_episodes=2) + self.unittest(environment=dict(environment='ple', level='PuckWorld'), num_episodes=2) + # TypeError: invalid start_pos argument + # self.unittest(environment=dict(environment='ple', level='RaycastMaze'), num_episodes=2) + self.unittest(environment=dict(environment='ple', level='Snake'), num_episodes=2) + self.unittest(environment=dict(environment='ple', level='WaterWorld'), num_episodes=2) + + @pytest.mark.skip(reason='not installed as part of travis') + def test_vizdoom(self): + self.start_tests(name='vizdoom') + self.unittest( + environment=dict(environment='vizdoom', level='test/data/basic.cfg'), num_episodes=2 + ) diff --git a/test/test_examples.py b/test/test_examples.py new file mode 100644 index 000000000..0d1b4580e --- /dev/null +++ b/test/test_examples.py @@ -0,0 +1,1043 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +from tempfile import TemporaryDirectory +from threading import Thread +import unittest + +import numpy as np +import tensorflow as tf + +from tensorforce import Agent, Environment, Runner +from test.unittest_base import UnittestBase + + +class TestExamples(UnittestBase, unittest.TestCase): + + agent = dict( + config=dict(device='CPU', eager_mode=True, create_debug_assertions=True, tf_log_level=20) + ) + + def test_quickstart(self): + self.start_tests(name='quickstart') + + with TemporaryDirectory() as saver_directory, TemporaryDirectory() as summarizer_directory: + + # ==================== + + # OpenAI-Gym environment specification + environment = dict(environment='gym', level='CartPole-v1') + # or: environment = Environment.create( + # environment='gym', level='CartPole-v1', max_episode_timesteps=500) + + # PPO agent specification + agent = dict( + agent='ppo', + # Automatically configured network + network='auto', + # PPO optimization parameters + batch_size=10, update_frequency=2, learning_rate=3e-4, multi_step=10, + subsampling_fraction=0.33, + # Reward estimation + likelihood_ratio_clipping=0.2, discount=0.99, predict_terminal_values=False, + reward_processing=None, + # Baseline network and optimizer + baseline=dict(type='auto', size=32, depth=1), + baseline_optimizer=dict(optimizer='adam', learning_rate=1e-3, multi_step=10), + # Regularization + l2_regularization=0.0, entropy_regularization=0.0, + # Preprocessing + state_preprocessing='linear_normalization', + # Exploration + exploration=0.0, variable_noise=0.0, + # Default additional config values + config=None, + # Save model every 10 updates and keep the 5 most recent checkpoints + saver=dict(directory=saver_directory, frequency=10, max_checkpoints=5), + # Log all available Tensorboard summaries + summarizer=dict(directory=summarizer_directory, summaries='all'), + # Do not record agent-environment interaction trace + recorder=None + ) + # or: Agent.create(agent='ppo', environment=environment, ...) 
+ # with additional argument "environment" and, if applicable, "parallel_interactions" + + # Initialize the runner + runner = Runner(agent=agent, environment=environment, max_episode_timesteps=500) + + # Train for 200 episodes + runner.run(num_episodes=20) + runner.close() + + # plus agent.close() and environment.close() if created separately + + # ==================== + + files = set(os.listdir(path=saver_directory)) + self.assertTrue(files == { + 'agent.json', 'agent-0.data-00000-of-00001', 'agent-0.index', + 'agent-10.data-00000-of-00001', 'agent-10.index', 'checkpoint' + }) + + directories = os.listdir(path=summarizer_directory) + self.assertEqual(len(directories), 1) + files = os.listdir(path=os.path.join(summarizer_directory, directories[0])) + self.assertEqual(len(files), 1) + self.assertTrue(files[0].startswith('events.out.tfevents.')) + + self.finished_test() + + def test_act_observe(self): + self.start_tests(name='act-observe') + + # ==================== + + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + + # Train for 100 episodes + for episode in range(10): + + # Episode using act and observe + states = environment.reset() + terminal = False + sum_reward = 0.0 + num_updates = 0 + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + num_updates += agent.observe(terminal=terminal, reward=reward) + sum_reward += reward + print('Episode {}: return={} updates={}'.format(episode, sum_reward, num_updates)) + + # Evaluate for 100 episodes + sum_rewards = 0.0 + for _ in range(10): + states = environment.reset() + internals = agent.initial_internals() + terminal = False + while not terminal: + actions, internals = agent.act( + states=states, internals=internals, independent=True, deterministic=True + ) + states, terminal, reward = environment.execute(actions=actions) + sum_rewards += reward + print('Mean evaluation return:', sum_rewards / 100.0) + + # Close agent and environment + agent.close() + environment.close() + + # ==================== + + self.finished_test() + + def test_act_observe_vectorized(self): + self.start_tests(name='act-observe-vectorized') + + # ==================== + + num_parallel = 8 + environment = Environment.create(environment='custom_cartpole', max_episode_timesteps=500) + agent = Agent.create( + agent='benchmarks/configs/ppo.json', environment=environment, + parallel_interactions=num_parallel + ) + + # Train for 100 episodes + for episode in range(0, 10, num_parallel): + + # Episode using act and observe + parallel, states = environment.reset(num_parallel=num_parallel) + terminal = (parallel < 0) # all false + sum_rewards = 0.0 + num_updates = 0 + while not terminal.all(): + actions = agent.act(states=states, parallel=parallel) + next_parallel, states, terminal, reward = environment.execute(actions=actions) + num_updates += agent.observe(terminal=terminal, reward=reward, parallel=parallel) + parallel = next_parallel + sum_rewards += reward.sum() + print('Episode {}: return={} updates={}'.format( + episode, sum_rewards / num_parallel, num_updates + )) + + # Evaluate for 100 episodes + num_parallel = 4 + num_episodes = 10 + sum_rewards = 0.0 + for _ in range(0, num_episodes, num_parallel): + parallel, states = environment.reset(num_parallel=num_parallel) + internals = agent.initial_internals() + internals = [internals for _ in range(num_parallel)] + terminal = (parallel < 0) 
# all false + while not terminal.all(): + actions, internals = agent.act( + states=states, internals=internals, independent=True, deterministic=True + ) + _, states, terminal, reward = environment.execute(actions=actions) + internals = [internal for internal, term in zip(internals, terminal) if not term] + sum_rewards += reward.sum() + print('Mean evaluation return:', sum_rewards / num_episodes) + + # Close agent and environment + agent.close() + environment.close() + + # ==================== + + self.finished_test() + + def test_act_experience_update(self): + self.start_tests(name='act-experience-update') + + # ==================== + + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + + # Train for 100 episodes + for episode in range(10): + + # Record episode experience + episode_states = list() + episode_internals = list() + episode_actions = list() + episode_terminal = list() + episode_reward = list() + + # Episode using independent-act and agent.intial_internals() + states = environment.reset() + internals = agent.initial_internals() + terminal = False + sum_reward = 0.0 + while not terminal: + episode_states.append(states) + episode_internals.append(internals) + actions, internals = agent.act(states=states, internals=internals, independent=True) + episode_actions.append(actions) + states, terminal, reward = environment.execute(actions=actions) + episode_terminal.append(terminal) + episode_reward.append(reward) + sum_reward += reward + print('Episode {}: {}'.format(episode, sum_reward)) + + # Feed recorded experience to agent + agent.experience( + states=episode_states, internals=episode_internals, actions=episode_actions, + terminal=episode_terminal, reward=episode_reward + ) + + # Perform update + agent.update() + + # Evaluate for 100 episodes + sum_rewards = 0.0 + for _ in range(10): + states = environment.reset() + internals = agent.initial_internals() + terminal = False + while not terminal: + actions, internals = agent.act( + states=states, internals=internals, independent=True, deterministic=True + ) + states, terminal, reward = environment.execute(actions=actions) + sum_rewards += reward + print('Mean evaluation return:', sum_rewards / 100.0) + + # Close agent and environment + agent.close() + environment.close() + + # ==================== + + self.finished_test() + + def test_action_masking(self): + self.start_tests(name='action-masking') + + # ==================== + + class EnvironmentWithMasking(Environment): + """ + States: {0, 1, ..., 9, 10} + Actions: {-1, 0, 1} + Action masking: action = -1 invalid for state = 0, action = 1 invalid for state = 10 + Reward: + - Positive: [state < 5, action = 1] or [state > 5, action = -1] + - Negative: [state < 5, action = -1] or [state > 5, action = 1] + """ + + def __init__(self): + super().__init__() + + def states(self): + # States specification does not need to include action mask item + return dict(type=int, shape=(), num_values=11) + + def actions(self): + # Only discrete actions can be masked + return dict(type=int, shape=(), num_values=3) + + def reset(self): + # Initial state and associated action mask + self.state = np.random.randint(3, 7) + action_mask = np.asarray([self.state > 0, True, self.state < 10]) + + # Add action mask to states dictionary (mask item is "[NAME]_mask", here "action_mask") + states = dict(state=self.state, action_mask=action_mask) + + return states + + def execute(self, actions): + # 
Compute terminal and reward + terminal = False + if actions == 1: + reward = -np.abs(self.state / 5.0 - 1.0) + else: + reward = (1 - actions) * (self.state / 5.0 - 1.0) + + # Compute next state and associated action mask + self.state += actions - 1 + action_mask = np.asarray([self.state > 0, True, self.state < 10]) + + # Add action mask to states dictionary (mask item is "[NAME]_mask", here "action_mask") + states = dict(state=self.state, action_mask=action_mask) + + return states, terminal, reward + + agent = 'benchmarks/configs/ppo.json' + runner = Runner(agent=agent, environment=EnvironmentWithMasking, max_episode_timesteps=20) + runner.run(num_episodes=10) + runner.close() + + # ==================== + + self.finished_test() + + def test_export_saved_model(self): + self.start_tests(name='export-saved-model') + + # ==================== + + # Batch inputs + def batch(x): + return np.expand_dims(x, axis=0) + + # Unbatch outputs + def unbatch(x): + if isinstance(x, tf.Tensor): # TF tensor to NumPy array + x = x.numpy() + if x.shape == (1,): # Singleton array to Python value + return x.item() + else: + return np.squeeze(x, axis=0) + + # Apply function to leaf values in nested dict + # (required for nested states/actions) + def recursive_map(function, dictionary): + mapped = dict() + for key, value in dictionary.items(): + if isinstance(value, dict): + mapped[key] = recursive_map(function, value) + else: + mapped[key] = function(value) + return mapped + + # ==================== + + with TemporaryDirectory() as directory: + + # ==================== + + # Train agent + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + runner = Runner(agent='benchmarks/configs/ppo.json', environment=environment) + runner.run(num_episodes=10) + + # Save agent SavedModel + runner.agent.save(directory=directory, format='saved-model') + runner.close() + + # Model serving, potentially using different programming language etc + # (For regular model saving and loading within Python, see save_load_agent.py example) + + # Load agent SavedModel + agent = tf.saved_model.load(export_dir=directory) + + # Evaluate for 100 episodes + sum_rewards = 0.0 + for _ in range(10): + states = environment.reset() + + # Required in case of internal states: + # internals = agent.initial_internals() + # internals = recursive_map(batch, internals) + + terminal = False + while not terminal: + + states = batch(states) + # Required in case of nested states: + # states = recursive_map(batch, states) + + auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool)) + deterministic = True + + actions = agent.act(states, auxiliaries, deterministic) + # Required in case of internal states: + # actions_internals = agent.act(states, internals, auxiliaries, deterministic) + # actions, internals = actions_internals['actions'], actions_internals['internals'] + + actions = unbatch(actions) + # Required in case of nested actions: + # actions = recursive_map(unbatch, actions) + + states, terminal, reward = environment.execute(actions=actions) + sum_rewards += reward + + print('Mean evaluation return:', sum_rewards / 100.0) + environment.close() + + # ==================== + + self.finished_test() + + def test_multiactor_environment(self): + self.start_tests(name='multi-actor environment') + + # ==================== + + class MultiactorEnvironment(Environment): + """ + Example multi-actor environment, illustrating best-practice implementation pattern. + + State space: position in [0, 10]. + Action space: movement in {-1, 0, 1}. 
+ Random start in [3, 7]. + Actor 1 perspective as is, actor 2 perspective mirrored. + Positive reward for being closer to 10. + """ + + def __init__(self): + super().__init__() + + def states(self): + return dict(type='int', num_values=11) + + def actions(self): + return dict(type='int', num_values=3) + + def num_actors(self): + return 2 # Indicates that environment has multiple actors + + def reset(self): + # Always for multi-actor environments: initialize parallel indices + self._parallel_indices = np.arange(self.num_actors()) + + # Single shared environment logic, plus per-actor perspective + self._states = 3 + np.random.randint(5) + self.second_actor = True + states = np.stack([self._states, 10 - self._states], axis=0) + + # Always for multi-actor environments: return per-actor values + return self._parallel_indices.copy(), states + + def execute(self, actions): + # Single shared environment logic, plus per-actor perspective + if self.second_actor: + self.second_actor = self.second_actor and not (np.random.random_sample() < 0.1) + terminal = np.stack([False, not self.second_actor], axis=0) + delta = (actions[0] - 1) - (actions[1] - 1) + self._states = np.clip(self._states + delta, a_min=0, a_max=10) + states = np.stack([self._states, 10 - self._states], axis=0) + else: + terminal = np.stack([False], axis=0) + delta = (actions[0] - 1) + self._states = np.clip(self._states + delta, a_min=0, a_max=10) + states = np.stack([self._states], axis=0) + reward = (states - 5.0) / 5.0 + + # Always for multi-actor environments: update parallel indices, and return per-actor values + self._parallel_indices = self._parallel_indices[~terminal] + return self._parallel_indices.copy(), states, terminal, reward + + # Multi-actor runner, automatically if environment.num_actors() > 1 + runner = Runner( + agent='benchmarks/configs/ppo.json', + environment=MultiactorEnvironment, + max_episode_timesteps=10 + ) + runner.run(num_episodes=10) + + # ==================== + + self.finished_test() + + def test_parallelization(self): + self.start_tests(name='parallelization') + + # ==================== + + agent = 'benchmarks/configs/ppo.json' + environment = 'benchmarks/configs/cartpole.json' + runner = Runner(agent=agent, environment=environment, num_parallel=4) + # Batch act/observe calls to agent, unless environment.is_vectorizable() + # (otherwise essentially equivalent to single environment) + runner.run(num_episodes=10, batch_agent_calls=True) + runner.close() + + # ==================== + + agent = 'benchmarks/configs/ppo.json' + environment = 'custom_cartpole' + runner = Runner(agent=agent, environment=environment, max_episode_timesteps=500, num_parallel=4) + runner.run(num_episodes=10) + runner.close() + + # ==================== + + agent = 'benchmarks/configs/ppo.json' + environment = 'benchmarks/configs/cartpole.json' + runner = Runner(agent=agent, environment=environment, num_parallel=4, remote='multiprocessing') + runner.run(num_episodes=10, batch_agent_calls=True) # optional: batch_agent_calls=True + runner.close() + + # ==================== + + agent = 'benchmarks/configs/ppo.json' + environment = 'benchmarks/configs/cartpole.json' + + def server(port): + Environment.create(environment=environment, remote='socket-server', port=port) + + server1 = Thread(target=server, kwargs=dict(port=65432)) + server2 = Thread(target=server, kwargs=dict(port=65433)) + server1.start() + server2.start() + + runner = Runner( + agent=agent, num_parallel=2, remote='socket-client', host='127.0.0.1', port=65432 + ) + 
runner.run(num_episodes=10) # optional: batch_agent_calls=True + runner.close() + + server1.join() + server2.join() + + # ==================== + + self.finished_test() + + def test_record_and_pretrain(self): + self.start_tests(name='record-and-pretrain') + + with TemporaryDirectory() as directory: + + # ==================== + + # Start recording traces after 80 episodes -- by then, the environment is solved + runner = Runner( + agent=dict( + agent='benchmarks/configs/ppo.json', + recorder=dict(directory=directory, start=8) + ), environment='benchmarks/configs/cartpole.json' + ) + runner.run(num_episodes=10) + runner.close() + + # ==================== + + # Trivial custom act function + def fn_act(states): + return int(states[2] < 0.0) + + # Record 20 episodes + runner = Runner( + agent=dict(agent=fn_act, recorder=dict(directory=directory)), + environment='benchmarks/configs/cartpole.json' + ) + # or: agent = Agent.create(agent=fn_act, recorder=dict(directory=directory)) + runner.run(num_episodes=2) + runner.close() + + # ==================== + + # Start recording traces after 80 episodes -- by then, the environment is solved + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=8) + runner.close() + + # Record 20 episodes + for episode in range(2, 4): + + # Record episode experience + episode_states = list() + episode_actions = list() + episode_terminal = list() + episode_reward = list() + + # Evaluation episode + states = environment.reset() + terminal = False + while not terminal: + episode_states.append(states) + actions = agent.act(states=states, independent=True, deterministic=True) + episode_actions.append(actions) + states, terminal, reward = environment.execute(actions=actions) + episode_terminal.append(terminal) + episode_reward.append(reward) + + # Write recorded episode trace to npz file + np.savez_compressed( + file=os.path.join(directory, 'trace-{:09d}.npz'.format(episode)), + states=np.stack(episode_states, axis=0), + actions=np.stack(episode_actions, axis=0), + terminal=np.stack(episode_terminal, axis=0), + reward=np.stack(episode_reward, axis=0) + ) + + # ==================== + + # Pretrain a new agent on the recorded traces: for 30 iterations, feed the + # experience of one episode to the agent and subsequently perform one update + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + agent.pretrain(directory=directory, num_iterations=30, num_traces=1, num_updates=1) + + # Evaluate the pretrained agent + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=10, evaluation=True) + runner.close() + + # Close agent and environment + agent.close() + environment.close() + + # ==================== + + # Performance test + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment) + agent.pretrain( + directory='test/data/ppo-traces', num_iterations=30, num_traces=1, num_updates=1 + ) + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=10, evaluation=True) + self.assertTrue( + all(episode_return == 500.0 for episode_return in runner.episode_returns) + ) + runner.close() + agent.close() + environment.close() + + files 
= sorted(os.listdir(path=directory)) + self.assertEqual(len(files), 6) + self.assertTrue(all( + file.startswith('trace-') and file.endswith('0000000{}.npz'.format(n)) + for n, file in zip([0, 1, 2, 3, 8, 9], files) + )) + + self.finished_test() + + def test_save_load_agent(self): + self.start_tests(name='save-load-agent') + + with TemporaryDirectory() as checkpoint_directory, TemporaryDirectory() as numpy_directory: + + # ==================== + + # OpenAI-Gym environment initialization + environment = Environment.create(environment='benchmarks/configs/cartpole.json') + + # PPO agent initialization + agent = Agent.create( + agent='benchmarks/configs/ppo.json', environment=environment, + # Option 1: Saver - save agent periodically every 10 updates + # and keep the 5 most recent checkpoints + saver=dict(directory=checkpoint_directory, frequency=1, max_checkpoints=5), + ) + + # Runner initialization + runner = Runner(agent=agent, environment=environment) + + # Training + runner.run(num_episodes=10) + runner.close() + + # Option 2: Explicit save + # (format: 'numpy' or 'hdf5' store only weights, 'checkpoint' stores full TensorFlow model, + # agent argument saver, specified above, uses 'checkpoint') + agent.save(directory=numpy_directory, format='numpy', append='episodes') + + # Close agent separately, since created separately + agent.close() + + # Load agent TensorFlow checkpoint + agent = Agent.load(directory=checkpoint_directory, format='checkpoint', environment=environment) + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=10, evaluation=True) + runner.close() + agent.close() + + # Load agent NumPy weights + agent = Agent.load(directory=numpy_directory, format='numpy', environment=environment) + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=10, evaluation=True) + runner.close() + agent.close() + + # Close environment separately, since created separately + environment.close() + + # ==================== + + self.finished_test() + + def test_temperature_controller(self): + self.start_tests(name='temperature-controller') + + # ==================== + + import pandas as pd + import matplotlib.pyplot as plt + import numpy as np + import math + + ## Compute the response for a given action and current temperature + def respond(action, current_temp, tau): + return action + (current_temp - action) * math.exp(-1.0/tau) + + ## Actions of a series of on, then off + sAction = pd.Series(np.array([1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0])) + sResponse = np.zeros(sAction.size) + + ## Update the response with the response to the action + for i in range(sAction.size): + ## Get last response + if i == 0: + last_response = 0 + else: + last_response = sResponse[i - 1] + sResponse[i] = respond(sAction[i], last_response, 3.0) + + ## Assemble and plot + df = pd.DataFrame(list(zip(sAction, sResponse)), columns=['action', 'response']) + df.plot() + + # ==================== + + def reward(temp): + delta = abs(temp - 0.5) + if delta < 0.1: + return 0.0 + else: + return -delta + 0.1 + + temps = [x * 0.01 for x in range(100)] + rewards = [reward(x) for x in temps] + + fig=plt.figure(figsize=(12, 4)) + + plt.scatter(temps, rewards) + plt.xlabel('Temperature') + plt.ylabel('Reward') + plt.title('Reward vs. 
Temperature') + + # ==================== + + ###----------------------------------------------------------------------------- + ## Imports + from tensorforce.environments import Environment + from tensorforce.agents import Agent + + ###----------------------------------------------------------------------------- + ### Environment definition + class ThermostatEnvironment(Environment): + """This class defines a simple thermostat environment. It is a room with + a heater, and when the heater is on, the room temperature will approach + the max heater temperature (usually 1.0), and when off, the room will + decay to a temperature of 0.0. The exponential constant that determines + how fast it approaches these temperatures over timesteps is tau. + """ + def __init__(self): + ## Some initializations. Will eventually parameterize this in the constructor. + self.tau = 3.0 + self.current_temp = np.random.random(size=(1,)) + + super().__init__() + + def states(self): + return dict(type='float', shape=(1,), min_value=0.0, max_value=1.0) + + def actions(self): + """Action 0 means no heater, temperature approaches 0.0. Action 1 means + the heater is on and the room temperature approaches 1.0. + """ + return dict(type='int', num_values=2) + + # Optional, should only be defined if environment has a natural maximum + # episode length + def max_episode_timesteps(self): + return super().max_episode_timesteps() + + # Optional + def close(self): + super().close() + + def reset(self): + """Reset state. + """ + # state = np.random.random(size=(1,)) + self.timestep = 0 + self.current_temp = np.random.random(size=(1,)) + return self.current_temp + + def response(self, action): + """Respond to an action. When the action is 1, the temperature + exponentially decays approaches 1.0. When the action is 0, + the current temperature decays towards 0.0. + """ + return action + (self.current_temp - action) * math.exp(-1.0 / self.tau) + + def reward_compute(self): + """ The reward here is 0 if the current temp is between 0.4 and 0.6, + else it is distance the temp is away from the 0.4 or 0.6 boundary. + + Return the value within the numpy array, not the numpy array. + """ + delta = abs(self.current_temp - 0.5) + if delta < 0.1: + return 0.0 + else: + return -delta[0] + 0.1 + + def execute(self, actions): + ## Check the action is either 0 or 1 -- heater on or off. + assert actions == 0 or actions == 1 + + ## Increment timestamp + self.timestep += 1 + + ## Update the current_temp + self.current_temp = self.response(actions) + + ## Compute the reward + reward = self.reward_compute() + + ## The only way to go terminal is to exceed max_episode_timestamp. + ## terminal == False means episode is not done + ## terminal == True means it is done. 
+ terminal = False + + return self.current_temp, terminal, reward + + ###----------------------------------------------------------------------------- + ### Create the environment + ### - Tell it the environment class + ### - Set the max timestamps that can happen per episode + environment = environment = Environment.create( + environment=ThermostatEnvironment, + max_episode_timesteps=100) + + # ==================== + + agent = Agent.create( + agent='tensorforce', environment=environment, update=64, + optimizer=dict(optimizer='adam', learning_rate=1e-3), objective='policy_gradient', + reward_estimation=dict(horizon=1) + ) + + # ==================== + + ### Initialize + environment.reset() + + ## Creation of the environment via Environment.create() creates + ## a wrapper class around the original Environment defined here. + ## That wrapper mainly keeps track of the number of timesteps. + ## In order to alter the attributes of your instance of the original + ## class, like to set the initial temp to a custom value, like here, + ## you need to access the `environment` member of this wrapped class. + ## That is why you see the way to set the current_temp like below. + environment.current_temp = np.array([0.5]) + states = environment.current_temp + + internals = agent.initial_internals() + terminal = False + + ### Run an episode + temp = [environment.current_temp[0]] + while not terminal: + actions, internals = agent.act(states=states, internals=internals, independent=True) + states, terminal, reward = environment.execute(actions=actions) + temp += [states[0]] + + ### Plot the run + plt.figure(figsize=(12, 4)) + ax=plt.subplot() + ax.set_ylim([0.0, 1.0]) + plt.plot(range(len(temp)), temp) + plt.hlines(y=0.4, xmin=0, xmax=99, color='r') + plt.hlines(y=0.6, xmin=0, xmax=99, color='r') + plt.xlabel('Timestep') + plt.ylabel('Temperature') + plt.title('Temperature vs. Timestep') + plt.show() + + # Train for 200 episodes + for _ in range(10): + states = environment.reset() + terminal = False + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + + # ==================== + + ### Initialize + environment.reset() + + ## Creation of the environment via Environment.create() creates + ## a wrapper class around the original Environment defined here. + ## That wrapper mainly keeps track of the number of timesteps. + ## In order to alter the attributes of your instance of the original + ## class, like to set the initial temp to a custom value, like here, + ## you need to access the `environment` member of this wrapped class. + ## That is why you see the way to set the current_temp like below. + environment.current_temp = np.array([1.0]) + states = environment.current_temp + + internals = agent.initial_internals() + terminal = False + + ### Run an episode + temp = [environment.current_temp[0]] + while not terminal: + actions, internals = agent.act(states=states, internals=internals, independent=True) + states, terminal, reward = environment.execute(actions=actions) + temp += [states[0]] + + ### Plot the run + plt.figure(figsize=(12, 4)) + ax=plt.subplot() + ax.set_ylim([0.0, 1.0]) + plt.plot(range(len(temp)), temp) + plt.hlines(y=0.4, xmin=0, xmax=99, color='r') + plt.hlines(y=0.6, xmin=0, xmax=99, color='r') + plt.xlabel('Timestep') + plt.ylabel('Temperature') + plt.title('Temperature vs. 
Timestep') + plt.show() + + # ==================== + + self.finished_test() + + + def test_vectorized_environment(self): + self.start_tests(name='vectorized environment') + + # ==================== + + class VectorizedEnvironment(Environment): + """ + Example vectorized environment, illustrating best-practice implementation pattern. + + State space: position in [0, 10]. + Action space: movement in {-1, 0, 1}. + Random start in [0, 3] or [7, 10]. + Positive reward for moving towards the center 5. + """ + + def __init__(self): + super().__init__() + + def states(self): + return dict(type='int', num_values=11) + + def actions(self): + return dict(type='int', num_values=3) + + def is_vectorizable(self): + return True # Indicates that environment is vectorizable + + def reset(self, num_parallel=None): + # Always for vectorized environments: initialize parallel indices + self._is_parallel = (num_parallel is not None) + if self._is_parallel: + self._parallel_indices = np.arange(num_parallel) + else: + self._parallel_indices = np.arange(1) + + # Vectorized environment logic + is_high = (np.random.random_sample(size=self._parallel_indices.shape) < 0.5) + offset = np.random.randint(4, size=self._parallel_indices.shape) + self._states = np.where(is_high, 10 - offset, offset) + + # Always for vectorized environments: return un-/vectorized values + if self._is_parallel: + return self._parallel_indices.copy(), self._states.copy() + else: + return self._states[0] + + def execute(self, actions): + # Always for vectorized environments: expand actions if non-vectorized + if not self._is_parallel: + actions = np.expand_dims(actions, axis=0) + + # Vectorized environment logic + reward = np.select( + condlist=[self._states < 5, self._states > 5], + choicelist=[(actions == 2).astype(np.float32), (actions == 0).astype(np.float32)], + default=np.ones(shape=self._parallel_indices.shape, dtype=np.float32) + ) + terminal = (np.random.random_sample(size=self._parallel_indices.shape) < 0.1) + self._states = np.clip(self._states + (actions - 1), a_min=0, a_max=10) + + # Always for vectorized environments: update parallel indices and states, + # and return un-/vectorized values + if self._is_parallel: + self._parallel_indices = self._parallel_indices[~terminal] + self._states = self._states[~terminal] + return self._parallel_indices.copy(), self._states.copy(), terminal, reward + else: + return self._states[0], terminal[0], reward[0] + + # Non-vectorized runner + runner = Runner( + agent='benchmarks/configs/ppo.json', + environment=VectorizedEnvironment, + max_episode_timesteps=10 + ) + runner.run(num_episodes=10) + + # Vectorized runner, automatically if num_parallel > 1 and environment.is_vectorizable() + # (and remote argument not specified) + runner = Runner( + agent='benchmarks/configs/ppo.json', + environment=VectorizedEnvironment, + max_episode_timesteps=10, + num_parallel=16 + ) + runner.run(num_episodes=10) + + # ==================== + + self.finished_test() diff --git a/test/test_features.py b/test/test_features.py new file mode 100644 index 000000000..d55723292 --- /dev/null +++ b/test/test_features.py @@ -0,0 +1,43 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +from random import random +from tempfile import TemporaryDirectory +import unittest + +import numpy as np + +from tensorforce import Agent, Environment, Runner +from test.unittest_base import UnittestBase + + +class TestFeatures(UnittestBase, unittest.TestCase): + + def test_masking(self): + # FEATURES.MD + self.start_tests(name='masking') + + agent = Agent.create(agent=self.agent_spec( + states=dict(type='float', shape=(10,)), + actions=dict(type='int', shape=(), num_values=3) + )) + + states = dict( + state=np.random.random_sample(size=(10,)), # state (default name: "state") + action_mask=[True, False, True] # mask as'[ACTION-NAME]_mask' (default name: "action") + ) + action = agent.act(states=states) + assert action != 1 diff --git a/test/test_layers.py b/test/test_layers.py new file mode 100644 index 000000000..dfc68b623 --- /dev/null +++ b/test/test_layers.py @@ -0,0 +1,247 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import unittest + +from test.unittest_base import UnittestBase + + +class TestLayers(UnittestBase, unittest.TestCase): + + def test_layers(self): + self.start_tests(name='layers') + + states = dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + network = [ + dict(type='register', tensor='test'), + dict(type='retrieve', tensors='test'), + dict(type='retrieve', tensors=('state', 'test'), aggregation='product') + ] + self.unittest(states=states, policy=network) + + states = dict( + int_state=dict(type='int', shape=(1, 2), num_values=4), + float_state=dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + ) + network = [ + dict(type='retrieve', tensors='float_state'), + dict(type='block', name='test1', layers=[ + dict(type='dense', size=4), + dict(type='block', name='test2', layers=[ + dict(type='dense', size=4) + ]), + dict(type='reuse', layer='test2'), + dict(type='block', name='test3', layers=[ + dict(type='lstm', size=4, horizon=2) + ]), + dict(type='reuse', layer='test2') + ]) + ] + baseline = [ + dict(type='retrieve', tensors='float_state'), + dict(type='reuse', layer='test1'), + dict(type='reuse', layer='test2') + ] + self.unittest(states=states, policy=network, baseline=baseline) + + def test_attention(self): + self.start_tests(name='attention') + + states = dict(type='float', shape=(2, 3), min_value=1.0, max_value=2.0) + network = [ + dict(type='self_attention', size=8), + dict(type='pooling', reduction='max') + ] + self.unittest(states=states, policy=network) + + states = dict(type='float', shape=(2, 2, 3), min_value=1.0, max_value=2.0) + network = [ + dict(type='self_attention', size=8, attention_size=7), + dict(type='pooling', reduction='max') + ] + self.unittest(states=states, policy=network) + + def test_convolution(self): + self.start_tests(name='convolution') + + states = dict(type='float', shape=(2, 3), min_value=1.0, max_value=2.0) + actions = dict( + bool_action=dict(type='bool', shape=(2,)), + int_action=dict(type='int', shape=(2, 2), num_values=4), + float_action=dict(type='float', shape=(2,), min_value=1.0, max_value=2.0), + beta_action=dict(type='float', shape=(2, 2), min_value=1.0, max_value=2.0) + ) + network = [ + dict(type='conv1d', size=8), + dict(type='conv1d_transpose', size=8), + dict(type='linear', size=8) + ] + self.unittest(states=states, actions=actions, policy=network) + + states = dict(type='float', shape=(2, 2, 3), min_value=1.0, max_value=2.0) + actions = dict( + bool_action=dict(type='bool', shape=(2, 2)), + int_action=dict(type='int', shape=(2, 2, 2), num_values=4), + float_action=dict(type='float', shape=(2, 2), min_value=1.0, max_value=2.0), + beta_action=dict(type='float', shape=(2, 2, 2), min_value=1.0, max_value=2.0) + ) + network = [ + dict(type='conv2d', size=8), + dict(type='conv2d_transpose', size=8), + dict(type='linear', size=8) + ] + self.unittest(states=states, actions=actions, policy=network) + + def test_dense(self): + self.start_tests(name='dense') + + states = dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + network = [ + dict(type='dense', size=8), + dict(type='linear', size=8) + ] + self.unittest(states=states, policy=network) + + def test_embedding(self): + self.start_tests(name='embedding') + + states = dict(type='int', shape=(), num_values=5) + network = [dict(type='embedding', size=8)] + self.unittest(states=states, policy=network) + + def test_input_rnn(self): + self.start_tests(name='input-rnn') + + states = 
dict(type='float', shape=(2, 3), min_value=1.0, max_value=2.0) + network = [ + dict(type='input_rnn', cell='gru', size=8, return_final_state=False), + dict(type='input_lstm', size=8) + ] + self.unittest(states=states, policy=network) + + def test_keras(self): + self.start_tests(name='keras') + + states = dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + network = [dict(type='keras', layer='Dense', units=8)] + self.unittest(states=states, policy=network) + + def test_misc(self): + self.start_tests(name='misc') + + states = dict(type='float', shape=(3, 2), min_value=1.0, max_value=2.0) + network = [ + dict(type='activation', nonlinearity='tanh'), + dict(type='dropout', rate=0.5), + (lambda x: x + 1.0), + dict(type='reshape', shape=6), + dict( + type='function', function=(lambda x: x[:, :2]), + output_spec=dict(type='float', shape=(2,)) + ) + ] + self.unittest(states=states, policy=network) + + def test_normalization(self): + self.start_tests(name='normalization') + + states = dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + network = [ + dict(type='linear_normalization'), + dict(type='exponential_normalization', decay=0.99), + dict(type='instance_normalization') + ] + # 'batch_normalization' used by all tests + self.unittest(states=states, policy=network) + + def test_pooling(self): + self.start_tests(name='pooling') + + states = dict(type='float', shape=(2, 3), min_value=1.0, max_value=2.0) + network = [ + dict(type='pool1d', reduction='average'), + dict(type='flatten') + ] + self.unittest(states=states, policy=network) + + states = dict(type='float', shape=(2, 2, 3), min_value=1.0, max_value=2.0) + network = [ + dict(type='pool2d', reduction='max'), + dict(type='pooling', reduction='max') + ] + self.unittest(states=states, policy=network) + + def test_preprocessing(self): + self.start_tests(name='preprocessing') + + states = dict(type='float', shape=(), min_value=-1.0, max_value=2.0) + state_preprocessing = [ + dict(type='sequence', length=3, concatenate=False), + dict(type='clipping', lower=-1.0, upper=1.0), + dict(type='linear_normalization') + ] + reward_processing = [dict(type='clipping', upper=1.0)] + network = [dict(type='dense', size=8)] + self.unittest( + states=states, experience_update=False, policy=network, + reward_estimation=dict( + horizon=3, estimate_advantage=True, predict_horizon_values='late', + reward_processing=reward_processing, + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ), + state_preprocessing=state_preprocessing, + ) + + states = dict( + state1=dict(type='float', shape=(4, 4, 3), min_value=1.0, max_value=2.0), + state2=dict(type='float', shape=(), min_value=-1.0, max_value=2.0) + ) + state_preprocessing = dict( + state1=[ + dict(type='image', height=2, width=2, grayscale=True), + dict(type='deltafier', concatenate=0), + dict(type='sequence', length=4), + dict(type='linear_normalization') + ], + state2=None + ) + reward_processing = dict(type='deltafier') + network = [dict(type='retrieve', tensors='state1'), dict(type='reshape', shape=32)] + # TODO: buffer_observe incompatible with Deltafier/Sequence expecting single-step inputs + self.unittest( + states=states, experience_update=False, policy=network, + reward_estimation=dict( + horizon=3, estimate_advantage=True, predict_horizon_values='late', + reward_processing=reward_processing, + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ), + 
state_preprocessing=state_preprocessing, + config=dict( + buffer_observe=1, device='CPU', eager_mode=True, create_debug_assertions=True, + tf_log_level=20 + ) + ) + + def test_rnn(self): + self.start_tests(name='rnn') + + states = dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + network = [dict(type='rnn', cell='gru', size=8, horizon=2)] + self.unittest(states=states, policy=network) + + network = [dict(type='lstm', size=7, horizon=1)] + self.unittest(states=states, policy=network) diff --git a/test/test_memories.py b/test/test_memories.py new file mode 100644 index 000000000..af8f6eeef --- /dev/null +++ b/test/test_memories.py @@ -0,0 +1,47 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest + +from test.unittest_base import UnittestBase + + +class TestMemories(UnittestBase, unittest.TestCase): + + def test_recent(self): + self.start_tests(name='recent') + + memory = dict(type='recent') + update = dict(unit='timesteps', batch_size=4) + self.unittest(update=update, memory=memory) + + memory = dict(type='recent') + update = dict(unit='episodes', batch_size=1) + self.unittest(update=update, memory=memory) + + def test_replay(self): + self.start_tests(name='replay') + + memory = dict(type='replay') + update = dict(unit='timesteps', batch_size=4) + self.unittest(update=update, memory=memory) + + memory = dict(type='replay') + update = dict(unit='episodes', batch_size=1) + self.unittest(update=update, memory=memory) + + memory = 100 + update = 4 + self.unittest(update=update, memory=memory) diff --git a/test/test_objectives.py b/test/test_objectives.py new file mode 100644 index 000000000..53d616e43 --- /dev/null +++ b/test/test_objectives.py @@ -0,0 +1,106 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
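# A sketch of how the memory / update specifications exercised by TestMemories plug into
# the generic 'tensorforce' agent. The surrounding arguments (state/action specs,
# optimizer, objective, reward_estimation) are illustrative assumptions that only mirror
# the specification style used elsewhere in this test suite.
from tensorforce import Agent

agent = Agent.create(
    agent='tensorforce',
    states=dict(type='float', shape=(3,)),
    actions=dict(type='int', shape=(), num_values=4),
    max_episode_timesteps=10,
    memory=dict(type='replay', capacity=100),     # or a plain int shorthand: memory=100
    update=dict(unit='timesteps', batch_size=4),  # or a plain int shorthand: update=4
    optimizer=dict(optimizer='adam', learning_rate=1e-3),
    objective='policy_gradient',
    reward_estimation=dict(horizon=3)
)
agent.close()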
+# ============================================================================== + +import unittest + +from test.unittest_base import UnittestBase + + +class TestObjectives(UnittestBase, unittest.TestCase): + + def test_deterministic_policy_gradient(self): + self.start_tests(name='deterministic-policy-gradient') + + actions = dict( + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0), + beta_action=dict(type='float', shape=(), min_value=1.0, max_value=2.0) + ) + # TODO: no-RNN restriction can be removed + policy = dict(network=dict(type='auto', size=8, depth=1, rnn=False), distributions=dict( + gaussian_action2=dict(type='gaussian', stddev_mode='global'), beta_action='beta' + )) + objective = 'deterministic_policy_gradient' + reward_estimation = dict( + horizon=3, estimate_advantage=True, predict_horizon_values='late', + predict_action_values=True, + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ) + baseline = dict(network=dict(type='auto', size=7, depth=1, rnn=False)) + baseline_objective = 'action_value' + self.unittest( + actions=actions, policy=policy, objective=objective, + reward_estimation=reward_estimation, baseline=baseline, + baseline_objective=baseline_objective + ) + + def test_plus(self): + self.start_tests(name='plus') + + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2), + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0) + ) + objective = dict(type='plus', objective1='policy_gradient', objective2='action_value') + self.unittest(actions=actions, objective=objective) + + def test_policy_gradient(self): + self.start_tests(name='policy-gradient') + + objective = 'policy_gradient' + self.unittest(objective=objective) + + objective = dict(type='policy_gradient', importance_sampling=True) + self.unittest(objective=objective) + + objective = dict(type='policy_gradient', clipping_value=1.0) + self.unittest(objective=objective) + + objective = dict(type='policy_gradient', importance_sampling=True, clipping_value=0.2) + self.unittest(objective=objective) + + objective = dict(type='policy_gradient', early_reduce=True) + self.unittest(objective=objective) + + def test_value(self): + self.start_tests(name='value') + + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2) + ) + + # State value does not affect advantage variables of main policy + objective = 'state_value' + self.unittest(actions=actions, baseline_objective=objective, entropy_regularization=0.0) + + policy = dict(network=dict(type='auto', size=8, depth=1, rnn=2)) + objective = dict(type='value', value='action') + self.unittest( + actions=actions, policy=policy, objective=objective, entropy_regularization=0.0 + ) + + objective = dict(type='value', value='state', huber_loss=1.0) + self.unittest(actions=actions, baseline_objective=objective, entropy_regularization=0.0) + + objective = dict(type='action_value', early_reduce=True) + self.unittest(actions=actions, 
baseline_objective=objective, entropy_regularization=0.0) diff --git a/test/test_optimizers.py b/test/test_optimizers.py new file mode 100644 index 000000000..0a21398a5 --- /dev/null +++ b/test/test_optimizers.py @@ -0,0 +1,111 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest + +from test.unittest_base import UnittestBase + + +class TestOptimizers(UnittestBase, unittest.TestCase): + + def test_evolutionary(self): + self.start_tests(name='evolutionary') + + self.unittest(optimizer=dict(type='evolutionary', learning_rate=1e-3)) + + self.unittest(optimizer=dict(type='evolutionary', learning_rate=1e-3, num_samples=5)) + + def test_optimizer_wrapper(self): + self.start_tests(name='optimizer-wrapper') + + self.unittest(optimizer=dict( + optimizer='adam', learning_rate=1e-1, clipping_threshold=1e-2, multi_step=5, + subsampling_fraction=0.5, linesearch_iterations=3, doublecheck_update=True + )) + + self.unittest(optimizer=dict(optimizer='adam', subsampling_fraction=2)) + + def test_natural_gradient(self): + self.start_tests(name='natural-gradient') + + self.unittest( + optimizer=dict(type='natural_gradient', learning_rate=1e-3, only_positive_updates=False) + ) + + def test_plus(self): + self.start_tests(name='plus') + + optimizer = dict( + type='plus', optimizer1=dict(type='adam', learning_rate=1e-3), + optimizer2=dict(type='adagrad', learning_rate=1e-3) + ) + self.unittest(optimizer=optimizer) + + def test_synchronization(self): + self.start_tests(name='synchronization') + + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2), + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0) + ) + # Requires same size, but can still vary RNN horizon + baseline = dict( + type='parametrized_distributions', network=dict(type='auto', size=8, depth=1, rnn=1), + distributions=dict( + int_action2=dict(type='categorical', temperature_mode='predicted'), + int_action3=dict(type='categorical', temperature_mode='global'), + gaussian_action2=dict( + type='gaussian', stddev_mode='global', bounded_transform='clipping' + ) + ) + ) + # Using policy_gradient here, since action_value is covered by DQN + baseline_objective = 'policy_gradient' + self.unittest( + actions=actions, baseline=baseline, + baseline_optimizer=dict(type='synchronization', update_weight=1.0), + baseline_objective=baseline_objective + ) + + self.unittest( + actions=actions, baseline=baseline, + baseline_optimizer=dict(type='synchronization', update_weight=1.0, sync_frequency=2), + baseline_objective=baseline_objective + ) + + def test_tf_optimizer(self): + self.start_tests(name='tf-optimizer') + + 
self.unittest(optimizer=dict(type='adam', learning_rate=1e-3)) + + self.unittest(optimizer=dict(type='adam', learning_rate=1e-3, gradient_norm_clipping=1.0)) + + try: + import tensorflow_addons as tfa + + self.unittest(optimizer=dict( + type='tf_optimizer', optimizer='radam', learning_rate=1e-3, + decoupled_weight_decay=0.01, lookahead=True, moving_average=True + )) + + except ModuleNotFoundError: + pass + except TypeError: + # TODO: temporary for version 0.11.1 + pass diff --git a/test/test_parameters.py b/test/test_parameters.py new file mode 100644 index 000000000..d09dd9c83 --- /dev/null +++ b/test/test_parameters.py @@ -0,0 +1,137 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest + +from test.unittest_base import UnittestBase + + +class TestParameters(UnittestBase, unittest.TestCase): + + def float_unittest(self, exploration): + agent, environment = self.prepare(exploration=exploration) + + states = environment.reset() + actions = agent.act(states=states) + exploration1 = agent.model.exploration.value().numpy().item() + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + + actions = agent.act(states=states) + exploration2 = agent.model.exploration.value().numpy().item() + if not isinstance(exploration, dict) or exploration['type'] == 'constant': + self.assertEqual(exploration2, exploration1) + else: + self.assertNotEqual(exploration2, exploration1) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + + agent.close() + environment.close() + + self.finished_test() + + def int_unittest(self, horizon): + agent, environment = self.prepare(reward_estimation=dict(horizon=horizon)) + + states = environment.reset() + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + horizon1 = agent.model.reward_horizon.value().numpy().item() + + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + horizon2 = agent.model.reward_horizon.value().numpy().item() + if not isinstance(horizon, dict) or horizon['type'] == 'constant': + self.assertEqual(horizon2, horizon1) + else: + self.assertNotEqual(horizon2, horizon1) + + agent.close() + environment.close() + + self.finished_test() + + def test_constant(self): + self.start_tests(name='constant') + + exploration = 0.1 + self.float_unittest(exploration=exploration) + + horizon = 4 + self.int_unittest(horizon=horizon) + + def test_decaying(self): + self.start_tests(name='decaying') + + exploration = dict( + type='decaying', decay='exponential', unit='timesteps', num_steps=5, initial_value=0.1, + decay_rate=0.5 + ) + self.float_unittest(exploration=exploration) + + horizon = dict( + 
type='polynomial', unit='timesteps', num_steps=1, initial_value=2, final_value=4, + power=2 + ) + self.int_unittest(horizon=horizon) + + def test_exponential(self): + self.start_tests(name='exponential') + + # SPECIFICATION.MD + exploration = dict( + type='exponential', unit='timesteps', num_steps=5, initial_value=0.1, decay_rate=0.5 + ) + self.float_unittest(exploration=exploration) + + def test_linear(self): + self.start_tests(name='linear') + + exploration = dict( + type='linear', unit='timesteps', num_steps=5, initial_value=0.1, final_value=0.5 + ) + self.float_unittest(exploration=exploration) + + # SPECIFICATION.MD + horizon = dict(type='linear', unit='timesteps', num_steps=1, initial_value=2, final_value=4) + self.int_unittest(horizon=horizon) + + def test_ornstein_uhlenbeck(self): + self.start_tests(name='ornstein-uhlenbeck') + + exploration = dict(type='ornstein_uhlenbeck', absolute=True) + self.float_unittest(exploration=exploration) + + def test_piecewise_constant(self): + self.start_tests(name='piecewise-constant') + + exploration = dict( + type='piecewise_constant', unit='timesteps', boundaries=[1], values=[0.1, 0.0] + ) + self.float_unittest(exploration=exploration) + + horizon = dict( + type='piecewise_constant', dtype='int', unit='timesteps', boundaries=[1], values=[1, 2] + ) + self.int_unittest(horizon=horizon) + + def test_random(self): + self.start_tests(name='random') + + exploration = dict(type='random', distribution='uniform') + self.float_unittest(exploration=exploration) diff --git a/test/test_policies.py b/test/test_policies.py new file mode 100644 index 000000000..90c2dafe0 --- /dev/null +++ b/test/test_policies.py @@ -0,0 +1,124 @@ +# Copyright 2021 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
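# A sketch of a schedule-valued hyperparameter, as exercised by TestParameters above:
# a linearly changing exploration passed to an (illustrative) PPO agent. Other schedule
# types shown in the tests ('exponential', 'piecewise_constant', 'ornstein_uhlenbeck',
# 'random', ...) follow the same pattern; the tests read the current value via
# agent.model.exploration.value().
from tensorforce import Agent

agent = Agent.create(
    agent='ppo', max_episode_timesteps=10, batch_size=2,
    states=dict(type='float', shape=(3,)),
    actions=dict(type='int', shape=(), num_values=4),
    exploration=dict(
        type='linear', unit='timesteps', num_steps=5, initial_value=0.1, final_value=0.5
    )
)
agent.close()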
+# ============================================================================== + +import unittest + +from test.unittest_base import UnittestBase + + +class TestPolicies(UnittestBase, unittest.TestCase): + + def test_keras_network(self): + self.start_tests(name='keras network') + + import tensorflow as tf + + class Model(tf.keras.Model): + + def __init__(self): + super().__init__() + self.layer1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) + self.layer2 = tf.keras.layers.Dense(5, activation=tf.nn.relu) + + def call(self, inputs): + x = self.layer1(inputs) + return self.layer2(x) + + states = dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + self.unittest(states=states, policy=Model) + + class Model(tf.keras.Model): + + def __init__(self, size): + super().__init__() + self.layer1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) + self.layer2 = tf.keras.layers.Embedding(4, 4) + self.layer3 = tf.keras.layers.Dense(size, activation=tf.nn.relu) + + def call(self, inputs): + y = self.layer1(inputs[0]) + x = self.layer2(inputs[1]) + return self.layer3(tf.concat(values=[x, y], axis=1)) + + states = dict( + int_state=dict(type='int', shape=(), num_values=4), + float_state=dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + ) + self.unittest(states=states, policy=dict(network=dict(type='keras', model=Model, size=5))) + + self.unittest(states=states, policy=Model(size=5)) + + def model(size): + return Model(size=size) + + self.unittest(states=states, policy=dict(network=dict(type='keras', model=model, size=5))) + + def test_multi_output(self): + self.start_tests(name='multi-output') + self.unittest( + states=dict( + state1=dict(type='float', shape=(2,), min_value=-1.0, max_value=2.0), + state2=dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + ), + actions=dict( + action1=dict(type='int', shape=(), num_values=3), + action2=dict(type='int', shape=(), num_values=4) + ), + policy=dict(network=[ + [ + dict(type='retrieve', tensors=['state1']), + dict(type='dense', size=16), + dict(type='register', tensor='action1-embedding') + ], + [ + dict(type='retrieve', tensors=['state2', 'action1-embedding']), + dict(type='dense', size=12) + ] + ], single_output=False), + baseline=dict(type='parametrized_value_policy', network=[ + [ + dict(type='retrieve', tensors=['state1']), + dict(type='dense', size=16), + dict(type='register', tensor='action1-embedding') + ], + [ + dict(type='retrieve', tensors=['state2', 'action1-embedding']), + dict(type='dense', size=12), + dict(type='register', tensor='state-embedding') + ] + ], single_output=False), + ) + + def test_categorical_skip_linear(self): + self.start_tests(name='categorical skip-linear') + self.unittest( + states=dict(type='float', shape=(3,), min_value=1.0, max_value=2.0), + actions=dict(type='int', shape=(2,), num_values=4), + policy=dict( + network=[dict(type='dense', size=8), dict(type='reshape', shape=(2, 4))], + distributions=dict(type='categorical', skip_linear=True) + ) + ) + + def test_categorical_skip_linear_no_shape(self): + self.start_tests(name='categorical skip-linear empty shape') + self.unittest( + states=dict(type='float', shape=(3,), min_value=1.0, max_value=2.0), + actions=dict(type='int', num_values=4), + policy=dict( + network=[dict(type='dense', size=4)], + distributions=dict(type='categorical', skip_linear=True) + ) + ) diff --git a/test/test_precision.py b/test/test_precision.py new file mode 100644 index 000000000..b2087130c --- /dev/null +++ b/test/test_precision.py @@ -0,0 +1,56 @@ +# Copyright 
2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest + +import numpy as np +import tensorflow as tf + +from tensorforce import util +from tensorforce.core import tf_util +from test.unittest_base import UnittestBase + + +class TestPrecision(UnittestBase, unittest.TestCase): + + def test_precision(self): + self.start_tests() + + try: + util.np_dtype_mapping = dict(bool=np.bool_, int=np.int32, float=np.float16) + tf_util.DTYPE_MAPPING = dict(bool=tf.bool, int=tf.int32, float=tf.float16) + + # TODO: TensorFlow optimizers seem incompatible with float16 + optimizer = dict(optimizer='evolutionary', learning_rate=1e-3) + baseline_optimizer = dict(optimizer='evolutionary', learning_rate=1e-3) + self.unittest( + optimizer=optimizer, baseline_optimizer=baseline_optimizer, + config=dict( + device='CPU', eager_mode=True, create_debug_assertions=True, tf_log_level=20 + ) + ) + + util.np_dtype_mapping = dict(bool=np.bool_, int=np.int64, float=np.float64) + tf_util.DTYPE_MAPPING = dict(bool=tf.bool, int=tf.int64, float=tf.float64) + + self.unittest() + + except BaseException as exc: + raise exc + self.assertTrue(expr=False) + + finally: + util.np_dtype_mapping = dict(bool=np.bool_, int=np.int64, float=np.float32) + tf_util.DTYPE_MAPPING = dict(bool=tf.bool, int=tf.int64, float=tf.float32) diff --git a/test/test_reward_estimation.py b/test/test_reward_estimation.py new file mode 100644 index 000000000..ff6c9f50e --- /dev/null +++ b/test/test_reward_estimation.py @@ -0,0 +1,276 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
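# A sketch of the global dtype override used by TestPrecision above: Tensorforce's
# numpy / TensorFlow dtype mappings are monkey-patched before agents are constructed
# and restored afterwards. Saving and restoring the previous mappings via local
# variables is an assumption of this sketch; the test restores hard-coded defaults.
import numpy as np
import tensorflow as tf

from tensorforce import util
from tensorforce.core import tf_util

previous_np = util.np_dtype_mapping
previous_tf = tf_util.DTYPE_MAPPING
try:
    util.np_dtype_mapping = dict(bool=np.bool_, int=np.int32, float=np.float16)
    tf_util.DTYPE_MAPPING = dict(bool=tf.bool, int=tf.int32, float=tf.float16)
    # ... construct and exercise an agent here: float tensors now use float16 ...
finally:
    util.np_dtype_mapping = previous_np
    tf_util.DTYPE_MAPPING = previous_tf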
+# ============================================================================== + +import unittest + +from test.unittest_base import UnittestBase + + +class TestRewardEstimation(UnittestBase, unittest.TestCase): + + agent = dict( + policy=dict(network=dict(type='auto', size=8, depth=1, rnn=2), distributions=dict( + int_action2=dict(type='categorical', temperature_mode='predicted'), + int_action3=dict(type='categorical', temperature_mode='global'), + gaussian_action2=dict(type='gaussian', stddev_mode='global'), + gaussian_action3=dict( + type='gaussian', stddev_mode='global', bounded_transform='clipping' + ), beta_action='beta' + )), update=dict(unit='timesteps', batch_size=4, frequency=0.5), + optimizer=dict(optimizer='adam', learning_rate=1e-3), + objective='policy_gradient', reward_estimation=dict( + horizon=3, estimate_advantage=True, predict_horizon_values='late', + reward_processing=dict(type='clipping', lower=-1.0, upper=1.0), + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ), l2_regularization=0.01, entropy_regularization=0.01, + state_preprocessing='linear_normalization', + exploration=0.01, variable_noise=0.01, + config=dict(device='CPU', eager_mode=True, create_debug_assertions=True, tf_log_level=20), + tracking='all' + ) + + def test_no_horizon_estimate(self): + self.start_tests(name='no horizon estimate') + + # shortest horizon + reward_estimation = dict( + horizon=1, discount=0.99, predict_horizon_values=False, + return_processing='batch_normalization' + ) + self.unittest(reward_estimation=reward_estimation) + + # horizon as long as episode + reward_estimation = dict( + horizon=10, discount=0.99, predict_horizon_values=False, + return_processing='batch_normalization' + ) + self.unittest(reward_estimation=reward_estimation) + + # episode horizon + reward_estimation = dict( + horizon='episode', discount=0.99, predict_horizon_values=False, + return_processing='batch_normalization' + ) + self.unittest(reward_estimation=reward_estimation) + + def test_early_horizon_estimate(self): + self.start_tests(name='early horizon estimate') + + # TODO: action value doesn't exist for Beta + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2), + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0) + ) + reward_estimation = dict( + horizon='episode', predict_horizon_values='early', predict_action_values=True, + return_processing='batch_normalization' + ) + # Implicit baseline = policy + self.unittest(actions=actions, reward_estimation=reward_estimation, config=dict( + buffer_observe=3, device='CPU', eager_mode=True, create_debug_assertions=True, + tf_log_level=20 + )) + + # TODO: action value doesn't exist for Beta + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2), + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0) + ) + update = dict(unit='episodes', batch_size=1) + reward_estimation = dict( + horizon=3, predict_horizon_values='early', 
return_processing='batch_normalization' + ) + # Implicit baseline = policy + baseline_optimizer = dict(optimizer='adam', learning_rate=1e-3) + baseline_objective = 'state_value' + self.unittest( + actions=actions, update=update, reward_estimation=reward_estimation, + baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective, + config=dict( + buffer_observe='episode', device='CPU', eager_mode=True, + create_debug_assertions=True, tf_log_level=20 + ) # or 1? + ) + + reward_estimation = dict( + horizon='episode', predict_horizon_values='early', predict_terminal_values=True, + return_processing='batch_normalization' + ) + # TODO: baseline horizon has to be equal to policy horizon + baseline = dict(network=dict(type='auto', size=7, depth=1, rnn=2)) + # Implicit baseline_optimizer = 1.0 + baseline_objective = 'state_value' + self.unittest( + reward_estimation=reward_estimation, baseline=baseline, + baseline_objective=baseline_objective + ) + + # Action-value baseline compatible with discrete actions + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2) + ) + reward_estimation = dict( + horizon=3, predict_horizon_values='early', predict_action_values=True, + predict_terminal_values=True, return_processing='batch_normalization' + ) + baseline = dict(network=dict(type='auto', size=7, depth=1, rnn=1)) + baseline_optimizer = dict(optimizer='adam', learning_rate=1e-3) + baseline_objective = 'action_value' + self.unittest( + actions=actions, reward_estimation=reward_estimation, baseline=baseline, + baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective + ) + + def test_late_horizon_estimate(self): + self.start_tests(name='late horizon estimate') + + # TODO: action value doesn't exist for Beta + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2), + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0) + ) + reward_estimation = dict( + horizon=3, predict_horizon_values='late', return_processing='batch_normalization' + ) + # Implicit baseline = policy + # Implicit baseline_optimizer = 1.0 + baseline_objective = 'state_value' + self.unittest( + actions=actions, reward_estimation=reward_estimation, + baseline_objective=baseline_objective + ) + + # Action-value baseline compatible with discrete actions + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2) + ) + reward_estimation = dict( + horizon=3, predict_horizon_values='late', predict_action_values=True, + return_processing='batch_normalization' + ) + # TODO: baseline horizon has to be equal to policy horizon + baseline = dict(network=dict(type='auto', size=7, depth=1, rnn=2)) + baseline_optimizer = 2.0 + baseline_objective = 'action_value' + self.unittest( + actions=actions, reward_estimation=reward_estimation, baseline=baseline, + baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective + ) + + # TODO: state value doesn't exist for Beta + actions = 
dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2), + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0) + ) + reward_estimation = dict( + horizon=3, predict_horizon_values='late', predict_terminal_values=True, + return_processing='batch_normalization' + ) + # Implicit baseline = policy + baseline_optimizer = dict(optimizer='adam', learning_rate=1e-3) + baseline_objective = 'state_value' + self.unittest( + actions=actions, reward_estimation=reward_estimation, + baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective + ) + + reward_estimation = dict( + horizon=3, predict_horizon_values='late', predict_action_values=True, + predict_terminal_values=True, return_processing='batch_normalization' + ) + # TODO: baseline horizon has to be equal to policy horizon + # (Not specifying customized distributions since action value doesn't exist for Beta) + baseline = dict( + type='parametrized_distributions', network=dict(type='auto', size=7, depth=1, rnn=2) + ) + baseline_optimizer = dict(optimizer='adam', learning_rate=1e-3) + baseline_objective = 'action_value' + self.unittest( + reward_estimation=reward_estimation, baseline=baseline, + baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective + ) + + def test_advantage_estimate(self): + self.start_tests(name='advantage estimate') + + reward_estimation = dict( + horizon=3, estimate_advantage=True, predict_horizon_values=False, + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ) + # TODO: baseline horizon has to be equal to policy horizon + baseline = dict(network=dict(type='auto', size=7, depth=1, rnn=2)) + # Implicit advantage computation as part of loss + self.unittest(reward_estimation=reward_estimation, baseline=baseline) + + # TODO: action value doesn't exist for Beta + actions = dict( + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=2), + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0) + ) + reward_estimation = dict( + horizon='episode', estimate_advantage=True, predict_horizon_values='early', + predict_action_values=True, + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ) + # Implicit baseline = policy + # Implicit baseline_optimizer = 1.0 + baseline_objective = 'state_value' + self.unittest( + actions=actions, reward_estimation=reward_estimation, + baseline_objective=baseline_objective + ) + + reward_estimation = dict( + horizon=3, estimate_advantage=True, predict_horizon_values='late', + predict_terminal_values=True, + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ) + baseline = dict(network=dict(type='auto', size=7, depth=1, rnn=1)) + baseline_optimizer = dict(optimizer='adam', learning_rate=1e-3) + baseline_objective = 'state_value' + self.unittest( + reward_estimation=reward_estimation, baseline=baseline, + baseline_optimizer=baseline_optimizer, 
baseline_objective=baseline_objective + ) diff --git a/test/test_runner.py b/test/test_runner.py new file mode 100644 index 000000000..60914cda1 --- /dev/null +++ b/test/test_runner.py @@ -0,0 +1,210 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest + +from tensorforce import Runner +from test.unittest_base import UnittestBase + + +class TestRunner(UnittestBase, unittest.TestCase): + + def test_single(self): + self.start_tests(name='single') + + agent = self.agent_spec() + environment = self.environment_spec() + runner = Runner(agent=agent, environment=environment) + + # default + runner.run(num_episodes=3, use_tqdm=False) + self.finished_test() + + # evaluation + runner.run(num_episodes=1, use_tqdm=False, evaluation=False) + self.finished_test() + + # episode callback + callback_episode_frequency = 2 + self.num_callbacks = 0 + + def callback(r, p): + self.num_callbacks += 1 + self.assertEqual(r.episodes, self.num_callbacks * callback_episode_frequency) + + runner.run( + num_episodes=5, callback=callback, + callback_episode_frequency=callback_episode_frequency, use_tqdm=False + ) + self.finished_test() + + # timestep callback + callback_timestep_frequency = 3 + self.num_callbacks = 0 + + def callback(r, p): + self.num_callbacks += 1 + self.assertEqual( + r.episode_timestep[p], self.num_callbacks * callback_timestep_frequency + ) + + runner.run( + num_episodes=1, callback=callback, + callback_timestep_frequency=callback_timestep_frequency, use_tqdm=False + ) + self.finished_test() + + # multiple callbacks + self.is_callback1 = False + self.is_callback2 = False + + def callback1(r, p): + self.is_callback1 = True + + def callback2(r, p): + self.is_callback2 = True + + runner.run( + num_episodes=1, callback=[callback1, callback2], + callback_timestep_frequency=callback_timestep_frequency, use_tqdm=False + ) + runner.close() + self.finished_test(assertion=(self.is_callback1 and self.is_callback2)) + + def test_unbatched(self): + self.start_tests(name='unbatched') + + agent = self.agent_spec() + environment = self.environment_spec() + + # default + runner = Runner(agent=agent, environment=environment, num_parallel=2) + runner.run(num_episodes=3, use_tqdm=False) + runner.close() + self.finished_test() + + # episode callback + runner = Runner(agent=agent, environments=[environment, environment]) + callback_episode_frequency = 2 + self.num_callbacks = 0 + + def callback(r, p): + self.num_callbacks += 1 + if self.num_callbacks % 2 == 0: + self.assertEqual(min(r.episode_timestep), 0) + self.assertEqual(r.episodes, self.num_callbacks * callback_episode_frequency) + + runner.run( + num_episodes=5, callback=callback, + callback_episode_frequency=callback_episode_frequency, use_tqdm=False, + sync_episodes=True + ) + self.finished_test() + + # timestep callback + callback_timestep_frequency = 3 + + def callback(r, p): + 
self.assertEqual(r.episode_timestep[p] % callback_timestep_frequency, 0) + + runner.run( + num_episodes=2, callback=callback, + callback_timestep_frequency=callback_timestep_frequency, use_tqdm=False + ) + runner.close() + self.finished_test() + + # evaluation synced + runner = Runner(agent=agent, environment=environment, num_parallel=2, evaluation=True) + self.num_evaluations = 0 + + def evaluation_callback(r): + self.num_evaluations += 1 + + runner.run( + num_episodes=1, use_tqdm=False, evaluation_callback=evaluation_callback, + sync_episodes=True + ) + self.finished_test(assertion=(self.num_evaluations == 1)) + + # evaluation non-synced + runner.run(num_episodes=1, use_tqdm=False, evaluation_callback=evaluation_callback) + runner.close() + self.finished_test(assertion=(self.num_evaluations >= 2)) + + def test_batched(self): + self.start_tests(name='batched') + + agent = self.agent_spec() + environment = self.environment_spec() + + # default + runner = Runner(agent=agent, environment=environment, num_parallel=2) + runner.run(num_episodes=3, use_tqdm=False, batch_agent_calls=True) + runner.close() + self.finished_test() + + # episode callback + runner = Runner(agent=agent, environments=[environment, environment]) + callback_episode_frequency = 2 + self.num_callbacks = 0 + + def callback(r, p): + self.num_callbacks += 1 + if self.num_callbacks % 2 == 0: + self.assertEqual(min(r.episode_timestep), 0) + self.assertEqual(r.episodes, self.num_callbacks * callback_episode_frequency) + + runner.run( + num_episodes=5, callback=callback, + callback_episode_frequency=callback_episode_frequency, use_tqdm=False, + batch_agent_calls=True, sync_episodes=True + ) + self.finished_test() + + # timestep callback + callback_timestep_frequency = 3 + + def callback(r, p): + self.assertEqual(r.episode_timestep[p] % callback_timestep_frequency, 0) + + runner.run( + num_episodes=2, callback=callback, + callback_timestep_frequency=callback_timestep_frequency, use_tqdm=False, + batch_agent_calls=True + ) + runner.close() + self.finished_test() + + # evaluation synced + runner = Runner(agent=agent, environment=environment, num_parallel=2, evaluation=True) + self.num_evaluations = 0 + + def evaluation_callback(r): + self.num_evaluations += 1 + + runner.run( + num_episodes=1, use_tqdm=False, evaluation_callback=evaluation_callback, + batch_agent_calls=True, sync_episodes=True + ) + self.finished_test(assertion=(self.num_evaluations == 1)) + + # evaluation non-synced + runner.run( + num_episodes=1, use_tqdm=False, evaluation_callback=evaluation_callback, + batch_agent_calls=True + ) + runner.close() + self.finished_test(assertion=(self.num_evaluations >= 2)) diff --git a/test/test_saving.py b/test/test_saving.py new file mode 100644 index 000000000..097c7ee83 --- /dev/null +++ b/test/test_saving.py @@ -0,0 +1,431 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
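# A standalone sketch of the Runner patterns exercised by TestRunner above. The PPO
# agent and the Gym CartPole environment are illustrative assumptions (CartPole-v1 only
# appears further below in test_saving.py); the Runner and callback arguments mirror
# the tested API.
from tensorforce import Agent, Environment, Runner

environment = Environment.create(environment='CartPole-v1', max_episode_timesteps=500)
agent = Agent.create(agent='ppo', environment=environment, batch_size=2)

def callback(runner, parallel):
    # Invoked every callback_episode_frequency episodes with the runner and parallel index
    print('episodes finished:', runner.episodes)

runner = Runner(agent=agent, environment=environment)
runner.run(num_episodes=4, callback=callback, callback_episode_frequency=2, use_tqdm=False)
runner.close()
agent.close()
environment.close()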
+# ============================================================================== + +import os +from tempfile import TemporaryDirectory +import unittest + +import numpy as np +import tensorflow as tf + +from tensorforce import Agent, Environment, Runner +from test.unittest_base import UnittestBase + + +class TestSaving(UnittestBase, unittest.TestCase): + + def test_modules(self): + self.start_tests(name='modules') + + with TemporaryDirectory() as directory: + agent, environment = self.prepare(config=dict( + device='CPU', eager_mode=False, create_debug_assertions=True, tf_log_level=20 + )) + states = environment.reset() + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + weights0 = agent.model.policy.network.layers[0][1].weights.numpy() + for module in agent.model.tensorforce_submodules: + path = module.save(directory=directory) + assert path == os.path.join(directory, module.full_name.replace('/', '.')) + agent.close() + environment.close() + + agent, environment = self.prepare(config=dict( + device='CPU', eager_mode=False, create_debug_assertions=True, tf_log_level=20 + )) + states = environment.reset() + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + for module in agent.model.tensorforce_submodules: + module.restore(directory=directory) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights0)) + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + + files = set(os.listdir(path=directory)) + self.assertTrue(len(files), 2 * len(agent.model.this_submodules)) + for module in agent.model.tensorforce_submodules: + filename = module.full_name.replace('/', '.') + self.assertTrue(filename + '.index' in files) + self.assertTrue(filename + '.data-00000-of-00001' in files) + + agent.close() + environment.close() + + self.finished_test() + + def test_explicit(self): + # FEATURES.MD + self.start_tests(name='explicit') + + with TemporaryDirectory() as directory: + update = dict(unit='episodes', batch_size=1) + agent, environment = self.prepare( + memory=50, update=update, config=dict( + device='CPU', eager_mode=False, create_debug_assertions=True, tf_log_level=20 + ) + ) + states = environment.reset() + + # save: default checkpoint format + weights0 = agent.model.policy.network.layers[0][1].weights.numpy() + agent.save(directory=directory) + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + self.assertEqual(agent.timesteps, 1) + agent.close() + self.finished_test() + + # load: only directory + agent = Agent.load(directory=directory, environment=environment) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights0)) + self.assertEqual(agent.timesteps, 0) + self.finished_test() + + # one timestep + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + + # save: numpy format, append timesteps + agent.save(directory=directory, format='numpy', append='timesteps') + agent.close() + self.finished_test() + + # load: numpy format and directory + agent = Agent.load(directory=directory, format='numpy', environment=environment) + x = 
agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights0)) + self.assertEqual(agent.timesteps, 1) + self.finished_test() + + # one timestep + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + + # save: numpy format, append timesteps + agent.save(directory=directory, format='numpy', append='timesteps') + agent.close() + self.finished_test() + + # load: numpy format and directory + agent = Agent.load(directory=directory, format='numpy', environment=environment) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights0)) + self.assertEqual(agent.timesteps, 2) + self.finished_test() + + # one episode + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + + # save: hdf5 format, filename, append episodes + weights1 = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(not np.allclose(weights1, weights0)) + self.assertEqual(agent.episodes, 1) + agent.save(directory=directory, filename='agent2', format='hdf5', append='episodes') + agent.close() + self.finished_test() + + # env close + environment.close() + + # differing agent config: update, parallel_interactions + # TODO: episode length, others? + environment = Environment.create(environment=self.environment_spec()) + + # load: filename (hdf5 format implicit) + update['batch_size'] = 2 + agent = Agent.load( + directory=directory, filename='agent2', environment=environment, update=update, + parallel_interactions=2 + ) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights1)) + self.assertEqual(agent.episodes, 1) + agent.close() + self.finished_test() + + # load: tensorflow format (filename explicit) + # TODO: parallel_interactions=2 should be possible, but problematic if all variables are + # saved in checkpoint format + agent = Agent.load( + directory=directory, format='checkpoint', environment=environment, update=update, + parallel_interactions=1 + ) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights0)) + self.assertEqual(agent.timesteps, 0) + self.assertEqual(agent.episodes, 0) + agent.close() + self.finished_test() + + # load: numpy format, full filename including timesteps suffix + agent = Agent.load( + directory=directory, filename='agent-1', format='numpy', environment=environment, + update=update, parallel_interactions=2 + ) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights0)) + self.assertEqual(agent.timesteps, 1) + self.assertEqual(agent.episodes, 0) + self.finished_test() + + # three episodes (due to batch_size change, mismatch with loaded internal last_update) + for _ in range(3): + states = environment.reset() + terminal = False + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + self.assertEqual(agent.updates, 1) + + # save: saved-model format, append updates + agent.save(directory=directory, format='saved-model', append='updates') + agent.close() + + # saved-model functions + def batch(x): + return np.expand_dims(x, axis=0) + + def unbatch(x): + if isinstance(x, tf.Tensor): + x = x.numpy() + if x.shape == (1,): + return x.item() + 
else: + return np.squeeze(x, axis=0) + + def recursive_map(function, dictionary): + mapped = dict() + for key, value in dictionary.items(): + if isinstance(value, dict): + mapped[key] = recursive_map(function, value) + else: + mapped[key] = function(value) + return mapped + + # load: saved-model format + agent = tf.saved_model.load(export_dir=os.path.join(directory, 'agent-1')) + + # one episode + states = environment.reset() + internals = agent.initial_internals() + internals = recursive_map(batch, internals) + terminal = False + while not terminal: + states = dict(states) + auxiliaries = dict( + int_action1=dict(mask=batch(states.pop('int_action1_mask'))), + int_action2=dict(mask=batch(states.pop('int_action2_mask'))), + int_action3=dict(mask=batch(states.pop('int_action3_mask'))) + ) + states = recursive_map(batch, states) + actions_internals = agent.act(states, internals, auxiliaries, False) + actions = actions_internals['actions'] + internals = actions_internals['internals'] + actions = recursive_map(unbatch, actions) + states, terminal, _ = environment.execute(actions=actions) + + environment.close() + + # saved-model format with singleton state/action, no internals, no masking + policy = dict(network=dict(type='auto', size=8, depth=1, rnn=False)) + update = dict(unit='episodes', batch_size=1) + baseline = dict(network=dict(type='auto', size=7, depth=1, rnn=False)) + agent, environment = self.prepare( + states=dict(type='float', shape=(), min_value=1.0, max_value=2.0), + actions=dict(type='float', shape=(), min_value=1.0, max_value=2.0), + policy=policy, update=update, baseline=baseline, config=dict( + device='CPU', eager_mode=False, create_debug_assertions=True, tf_log_level=20 + ) + ) + + # one episode + states = environment.reset() + terminal = False + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + agent.observe(terminal=terminal, reward=reward) + self.assertEqual(agent.updates, 1) + + # save: saved-model format, append updates + agent.save(directory=directory, format='saved-model', append='updates') + agent.close() + + # load: saved-model format + agent = tf.saved_model.load(export_dir=os.path.join(directory, 'agent-1')) + + # one episode + states = environment.reset() + terminal = False + while not terminal: + states = batch(states) + actions = agent.act(states, True) + actions = unbatch(actions) + states, terminal, _ = environment.execute(actions=actions) + + environment.close() + + files = set(os.listdir(path=directory)) + self.assertTrue(files == { + 'agent.json', 'agent-1', 'agent-1.data-00000-of-00001', 'agent-1.index', + 'agent-1.npz', 'agent2.json', 'agent-2.npz', 'agent2-1.hdf5', 'checkpoint' + }) + files = set(os.listdir(path=os.path.join(directory, 'agent-1'))) + self.assertTrue(files == {'assets', 'saved_model.pb', 'variables'}) + files = set(os.listdir(path=os.path.join(directory, 'agent-1', 'variables'))) + self.assertTrue(files == {'variables.data-00000-of-00001', 'variables.index'}) + + self.finished_test() + + def test_config(self): + # FEATURES.MD + self.start_tests(name='config') + + with TemporaryDirectory() as directory: + # save: before first timestep + update = dict(unit='episodes', batch_size=1) + saver = dict(directory=directory, frequency=1) + agent, environment = self.prepare( + update=update, saver=saver, config=dict( + device='CPU', eager_mode=False, create_debug_assertions=True, tf_log_level=20 + ) + ) + weights0 = agent.model.policy.network.layers[0][1].weights.numpy() + 
states = environment.reset() + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + updated = agent.observe(terminal=terminal, reward=reward) + agent.close() + self.finished_test() + + # load: from given directory + agent = Agent.load(directory=directory, environment=environment) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights0)) + self.assertEqual(agent.timesteps, 0) + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + updated = agent.observe(terminal=terminal, reward=reward) + self.assertTrue(updated) + weights1 = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(not np.allclose(weights1, weights0)) + timesteps = agent.timesteps + agent.close() + self.finished_test() + + # load: from given directory + agent = Agent.load(directory=directory, environment=environment) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights1)) + self.assertEqual(agent.timesteps, timesteps) + agent.close() + environment.close() + self.finished_test() + + # create, not load + agent, environment = self.prepare( + update=update, saver=saver, config=dict( + device='CPU', eager_mode=False, create_debug_assertions=True, tf_log_level=20 + ) + ) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(not np.allclose(x, weights0)) + self.assertTrue(not np.allclose(x, weights1)) + self.assertEqual(agent.timesteps, 0) + states = environment.reset() + terminal = False + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + updated = agent.observe(terminal=terminal, reward=reward) + self.assertTrue(updated) + weights2 = agent.model.policy.network.layers[0][1].weights.numpy() + agent.close() + self.finished_test() + + # load: from given directory + agent = Agent.load(directory=directory, environment=environment) + x = agent.model.policy.network.layers[0][1].weights.numpy() + self.assertTrue(np.allclose(x, weights2)) + agent.close() + environment.close() + self.finished_test() + + files = set(os.listdir(path=directory)) + self.assertTrue(files == { + 'agent.json', 'agent-0.data-00000-of-00001', 'agent-0.index', + 'agent-1.data-00000-of-00001', 'agent-1.index', 'checkpoint' + }) + + self.finished_test() + + def test_load_performance(self): + self.start_tests(name='load-performance') + + environment = Environment.create(environment='CartPole-v1') + + agent = dict(directory='test/data', filename='ppo-checkpoint', format='checkpoint') + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=10, use_tqdm=False, evaluation=True) + self.assertTrue(all(episode_return == 500.0 for episode_return in runner.episode_returns)) + runner.close() + self.finished_test() + + agent = dict(directory='test/data', filename='ppo-checkpoint', format='numpy') + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=10, use_tqdm=False, evaluation=True) + self.assertTrue(all(episode_return == 500.0 for episode_return in runner.episode_returns)) + runner.close() + self.finished_test() + + agent = dict(directory='test/data', filename='ppo-checkpoint', format='hdf5') + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=10, use_tqdm=False, evaluation=True) + self.assertTrue(all(episode_return == 500.0 for episode_return in 
runner.episode_returns)) + runner.close() + self.finished_test() + + agent = tf.saved_model.load(export_dir='test/data/ppo-checkpoint') + + # 10 episodes + for _ in range(10): + states = environment.reset() + terminal = False + episode_return = 0.0 + while not terminal: + states = np.expand_dims(states, axis=0) + auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool)) + actions = agent.act(states, auxiliaries, True) + actions = actions.numpy().item() + states, terminal, reward = environment.execute(actions=actions) + episode_return += reward + self.assertEqual(episode_return, 500.0) + + environment.close() + self.finished_test() diff --git a/test/test_seed.py b/test/test_seed.py new file mode 100644 index 000000000..073f9e53a --- /dev/null +++ b/test/test_seed.py @@ -0,0 +1,130 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest + +import numpy as np + +from test.unittest_base import UnittestBase + + +class TestSeed(UnittestBase, unittest.TestCase): + + def test_seed(self): + self.start_tests() + + states = dict( + int_state=dict(type='int', shape=(2,), num_values=4), + float_state=dict(type='float', shape=(2,), min_value=1.0, max_value=2.0), + ) + actions = dict( + int_action=dict(type='int', shape=(2,), num_values=4), + float_action=dict(type='float', shape=(2,), min_value=1.0, max_value=2.0), + ) + + agent, environment = self.prepare( + states=states, actions=actions, config=dict( + seed=0, device='CPU', eager_mode=True, create_debug_assertions=True, + tf_log_level=20 + ) + ) + + print_environment = False + print_agent = False + + states = environment.reset() + if print_environment: + print(states['int_state']) + print(states['float_state']) + else: + self.assertTrue(expr=np.allclose(a=states['int_state'], b=np.asarray([2, 3]))) + self.assertTrue(expr=np.allclose( + a=states['float_state'], b=np.asarray([1.33350747, 1.92415877]) + )) + + actions = agent.act(states=states) + if print_agent: + print(actions['int_action']) + print(actions['float_action']) + else: + self.assertTrue(expr=np.allclose(a=actions['int_action'], b=np.asarray([3, 0]))) + self.assertTrue(expr=np.allclose( + a=actions['float_action'], b=np.asarray([1.5877049, 1.4052042]) + )) + + states, terminal, reward = environment.execute(actions=actions) + updated = agent.observe(terminal=terminal, reward=reward) + if print_environment: + print(states['int_state']) + print(states['float_state']) + print(terminal, reward, updated) + else: + self.assertTrue(expr=np.allclose(a=states['int_state'], b=np.asarray([1, 2]))) + self.assertTrue(expr=np.allclose( + a=states['float_state'], b=np.asarray([1.71033683, 1.0078841]) + )) + self.assertFalse(expr=terminal) + self.assertEqual(first=reward, second=0.6888437030500962) + self.assertFalse(expr=updated) + + actions = agent.act(states=states) + if print_agent: + print(actions['int_action']) + print(actions['float_action']) + else: + 
self.assertTrue(expr=np.allclose(a=actions['int_action'], b=np.asarray([3, 1]))) + self.assertTrue(expr=np.allclose( + a=actions['float_action'], b=np.asarray([1.5590835, 1.4156684]) + )) + + states, terminal, reward = environment.execute(actions=actions) + updated = agent.observe(terminal=terminal, reward=reward) + if print_environment: + print(states['int_state']) + print(states['float_state']) + print(terminal, reward, updated) + else: + self.assertTrue(expr=np.allclose(a=states['int_state'], b=np.asarray([1, 3]))) + self.assertTrue(expr=np.allclose( + a=states['float_state'], b=np.asarray([1.60039224, 1.58873961]) + )) + self.assertFalse(expr=terminal) + self.assertEqual(first=reward, second=0.515908805880605) + self.assertFalse(expr=updated) + + actions = agent.act(states=states) + if print_agent: + print(actions['int_action']) + print(actions['float_action']) + else: + self.assertTrue(expr=np.allclose(a=actions['int_action'], b=np.asarray([0, 3]))) + self.assertTrue(expr=np.allclose( + a=actions['float_action'], b=np.asarray([1.449745, 1.6435499]) + )) + + states, terminal, reward = environment.execute(actions=actions) + updated = agent.observe(terminal=terminal, reward=reward) + if print_environment: + print(states['int_state']) + print(states['float_state']) + print(terminal, reward, updated) + else: + self.assertTrue(expr=np.allclose(a=states['int_state'], b=np.asarray([1, 0]))) + self.assertTrue(expr=np.allclose( + a=states['float_state'], b=np.asarray([1.13346147, 1.98058013]) + )) + self.assertFalse(expr=terminal) + self.assertEqual(first=reward, second=-0.15885683833831) + self.assertFalse(expr=updated) diff --git a/test/test_specifications.py b/test/test_specifications.py new file mode 100644 index 000000000..09673f489 --- /dev/null +++ b/test/test_specifications.py @@ -0,0 +1,87 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import unittest + +from tensorforce.core import tf_function +from tensorforce.core.memories import Replay +from tensorforce.core.networks import LayerbasedNetwork +from test.unittest_base import UnittestBase + + +class TestNetwork(LayerbasedNetwork): + + def __init__(self, name, inputs_spec, outputs=None): + super().__init__(name=name, inputs_spec=inputs_spec, outputs=outputs) + + self.layer1 = self.submodule(name='dense0', module=dict(type='dense', size=8)) + self.layer2 = self.submodule(name='dense1', module=dict(type='dense', size=8)) + + @tf_function(num_args=4) + def apply(self, x, horizons, internals, deterministic, independent): + x = self.layer2.apply(x=self.layer1.apply(x=next(iter(x.values())))) + return x, dict() + + +class TestSpecifications(UnittestBase, unittest.TestCase): + + def specification_unittest(self, network, memory): + states = dict(type='float', shape=(3,), min_value=1.0, max_value=2.0) + + agent, environment = self.prepare( + states=states, policy=dict(network=network), memory=memory + ) + + states = environment.reset() + internals = agent.initial_internals() + actions, internals = agent.act(states=states, internals=internals, independent=True) + states, terminal, reward = environment.execute(actions=actions) + + agent.close() + environment.close() + + self.finished_test() + + def test_specifications(self): + # SPECIFICATION.MD + self.start_tests() + + # default + self.specification_unittest( + network=dict(type='layered', layers=[dict(type='dense', size=8)]), + memory=dict(type='replay', capacity=100) + ) + + # json + self.specification_unittest( + network='test/data/network.json', + memory=dict(type='test/data/memory.json', capacity=100) + ) + + # module + self.specification_unittest( + network='test.test_specifications', + memory=dict(type='tensorforce.core.memories.Replay', capacity=100) + ) + + # callable + self.specification_unittest( + network=TestNetwork, memory=dict(type=Replay, capacity=100) + ) + + # default (+firstarg) + self.specification_unittest( + network=[dict(type='dense', size=8)], memory=dict(capacity=100) + ) diff --git a/test/test_summaries_tracking.py b/test/test_summaries_tracking.py new file mode 100644 index 000000000..987a02829 --- /dev/null +++ b/test/test_summaries_tracking.py @@ -0,0 +1,132 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +from tempfile import TemporaryDirectory +import unittest + +from test.unittest_base import UnittestBase + + +class TestSummaries(UnittestBase, unittest.TestCase): + + tracked_tensors = frozenset([ + # distribution + 'agent/policy/bool_action_distribution/probability', + 'agent/policy/int_action1_distribution/probabilities', + 'agent/policy/int_action2_distribution/probabilities', + 'agent/policy/int_action2_distribution/temperature', + 'agent/policy/int_action3_distribution/probabilities', + 'agent/policy/int_action3_distribution/temperature', + 'agent/policy/gaussian_action1_distribution/mean', + 'agent/policy/gaussian_action1_distribution/stddev', + 'agent/policy/gaussian_action2_distribution/mean', + 'agent/policy/gaussian_action2_distribution/stddev', + 'agent/policy/beta_action_distribution/alpha', + 'agent/policy/beta_action_distribution/beta', + # entropy + 'agent/entropy', + 'agent/entropies/bool_action', + 'agent/entropies/int_action1', + 'agent/entropies/int_action2', + 'agent/entropies/int_action3', + 'agent/entropies/gaussian_action1', + 'agent/entropies/gaussian_action2', + 'agent/entropies/beta_action', + # kl-divergence + 'agent/kl-divergence', + 'agent/kl-divergences/bool_action', + 'agent/kl-divergences/int_action1', + 'agent/kl-divergences/int_action2', + 'agent/kl-divergences/int_action3', + 'agent/kl-divergences/gaussian_action1', + 'agent/kl-divergences/gaussian_action2', + 'agent/kl-divergences/beta_action', + # loss + 'agent/policy-loss', + 'agent/policy-objective-loss', + 'agent/policy-regularization-loss', + 'agent/baseline-loss', + 'agent/baseline-objective-loss', + 'agent/baseline-regularization-loss', + # parameters + 'agent/policy_optimizer/policy_optimizer/learning_rate/learning_rate', + 'agent/reward_horizon/reward_horizon', + 'agent/exploration/exploration', + # reward + 'agent/preprocessed-reward', + 'agent/preprocessed-episode-return', + 'agent/update-return', + 'agent/update-processed-return', + 'agent/update-advantage', + 'agent/update-processed-advantage' + ]) + + def test_summaries(self): + # FEATURES.MD + self.start_tests() + + learning_rate = dict( + type='linear', unit='updates', num_steps=10, initial_value=1e-3, final_value=1e-4 + ) + horizon = dict(type='linear', unit='episodes', num_steps=2, initial_value=2, final_value=4) + exploration = dict( + type='exponential', unit='timesteps', num_steps=5, initial_value=0.1, decay_rate=0.5 + ) + + with TemporaryDirectory() as directory: + agent, environment = self.prepare( + optimizer=dict(optimizer='adam', learning_rate=learning_rate), + reward_estimation=dict( + horizon=horizon, estimate_advantage=True, predict_horizon_values='late', + reward_processing=dict(type='clipping', lower=-1.0, upper=1.0), + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ), + exploration=exploration, + config=dict( + device='CPU', eager_mode=False, create_debug_assertions=True, tf_log_level=20 + ), + summarizer=dict(directory=directory, summaries='all') + ) + + updates = 0 + episodes = 0 + while episodes < 3 or updates < 3: + states = environment.reset() + terminal = False + while not terminal: + actions = agent.act(states=states) + states, terminal, reward = environment.execute(actions=actions) + updates += int(agent.observe(terminal=terminal, reward=reward)) + tracked = set(agent.tracked_tensors()) + self.assertEqual(tracked, self.__class__.tracked_tensors, msg=( + 
tracked - self.__class__.tracked_tensors, + self.__class__.tracked_tensors - tracked + )) + episodes += 1 + + # print(set(agent.tracked_tensors()) - self.__class__.tracked_tensors) + + agent.close() + environment.close() + + directories = os.listdir(path=directory) + self.assertEqual(len(directories), 1) + files = os.listdir(path=os.path.join(directory, directories[0])) + self.assertEqual(len(files), 1) + self.assertTrue(files[0].startswith('events.out.tfevents.')) + + self.finished_test() diff --git a/test/unittest_base.py b/test/unittest_base.py new file mode 100644 index 000000000..96d828a27 --- /dev/null +++ b/test/unittest_base.py @@ -0,0 +1,208 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from copy import deepcopy +from datetime import datetime +import sys + +from tensorforce import Agent, Environment, Runner +from test.unittest_environment import UnittestEnvironment + + +class UnittestBase(object): + """Unit-test base class.""" + + # Environment + states = dict( + bool_state=dict(type='bool', shape=(1,)), + int_state=dict(type='int', shape=(1, 2), num_values=4), + float_state=dict(type='float', shape=(), min_value=1.0, max_value=2.0) + ) + actions = dict( + # Also in: test_agents, test_layers, test_objectives, test_optimizers, test_policies, + # test_reward_estimation, test_seed + bool_action=dict(type='bool', shape=(1,)), + int_action1=dict(type='int', shape=(), num_values=4), + int_action2=dict(type='int', shape=(2,), num_values=3), + int_action3=dict(type='int', shape=(2, 1), num_values=3), + gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0), + gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0), + beta_action=dict(type='float', shape=(), min_value=1.0, max_value=2.0) + ) + min_timesteps = 5 + max_episode_timesteps = 10 + experience_update = True + + # Agent + agent = dict( + # Also in: test_policies, test_reward_estimation + policy=dict(network=dict(type='auto', size=8, depth=1, rnn=2), distributions=dict( + # As part of baseline also in: test_optimizers + int_action2=dict(type='categorical', temperature_mode='predicted'), + int_action3=dict(type='categorical', temperature_mode='global'), + gaussian_action2=dict( + type='gaussian', stddev_mode='global', bounded_transform='clipping' + ), beta_action='beta' + )), + update=4, + # Also in: test_summaries_tracking + optimizer=dict(optimizer='adam', learning_rate=1e-3), + objective='policy_gradient', + # Also in: test_documentation, test_summaries_tracking + reward_estimation=dict( + horizon=3, estimate_advantage=True, predict_horizon_values='late', + reward_processing=dict(type='clipping', lower=-1.0, upper=1.0), + return_processing=dict(type='clipping', lower=-1.0, upper=1.0), + advantage_processing='batch_normalization' + ), + baseline=dict(network=dict(type='auto', size=7, depth=1, rnn=1)), + baseline_optimizer=dict(optimizer='adam', 
learning_rate=1e-3), + baseline_objective='state_value', l2_regularization=0.01, entropy_regularization=0.01, + state_preprocessing='linear_normalization', + # Also in: test_summaries_tracking + exploration=0.01, + variable_noise=0.01, + # Config default changes need to be adapted everywhere (search "config=dict"): + # test_agents, test_examples, test_layers, test_precision, + # test_reward_estimation, test_saving, test_seed, test_summaries + config=dict(device='CPU', eager_mode=True, create_debug_assertions=True, tf_log_level=20), + tracking='all' + ) + + def start_tests(self, name=None): + """ + Start unit-test method. + """ + if name is None: + sys.stdout.write('\n{} {}: '.format( + datetime.now().strftime('%H:%M:%S'), self.__class__.__name__[4:] + )) + else: + sys.stdout.write('\n{} {} ({}): '.format( + datetime.now().strftime('%H:%M:%S'), self.__class__.__name__[4:], name + )) + sys.stdout.flush() + + def finished_test(self, assertion=None): + """ + Finished unit-test. + """ + if assertion is None: + assertion = True + else: + self.assertTrue(expr=assertion) + if assertion: + sys.stdout.write('.') + sys.stdout.flush() + + def environment_spec(self, states=None, actions=None): + if states is None: + states = deepcopy(self.__class__.states) + + if actions is None: + actions = deepcopy(self.__class__.actions) + + return dict( + environment=UnittestEnvironment, + max_episode_timesteps=self.__class__.max_episode_timesteps, + states=states, actions=actions, min_timesteps=self.__class__.min_timesteps + ) + + def agent_spec(self, **agent): + for key, value in self.__class__.agent.items(): + if key not in agent: + agent[key] = value + + return dict(agent=agent) + + def prepare(self, environment=None, states=None, actions=None, **agent): + """ + Generic unit-test preparation. 
+ """ + if environment is None: + environment = self.environment_spec(states=states, actions=actions) + environment = Environment.create(environment=environment) + + else: + environment = Environment.create( + environment=environment, max_episode_timesteps=self.__class__.max_episode_timesteps + ) + + agent = self.agent_spec(**agent) + + agent = Agent.create(agent=agent, environment=environment) + assert agent.__class__.__name__ in ('ConstantAgent', 'RandomAgent') or \ + isinstance(agent.model.get_architecture(), str) + + return agent, environment + + def execute(self, agent, environment, num_episodes=None, experience_update=None): + if num_episodes is None: + num_updates = 2 + else: + num_updates = None + + runner = Runner(agent=agent, environment=environment) + runner.run(num_episodes=num_episodes, num_updates=num_updates, use_tqdm=False) + runner.close() + + # Test experience-update, independent, deterministic + if experience_update or (experience_update is None and self.__class__.experience_update): + + for episode in range(2 if num_episodes is None else num_episodes): + episode_states = list() + episode_internals = list() + episode_actions = list() + episode_terminal = list() + episode_reward = list() + states = environment.reset() + internals = agent.initial_internals() + terminal = False + deterministic = True + while not terminal: + episode_states.append(states) + episode_internals.append(internals) + actions, internals = agent.act( + states=states, internals=internals, independent=True, + deterministic=deterministic + ) + deterministic = not deterministic + episode_actions.append(actions) + states, terminal, reward = environment.execute(actions=actions) + episode_terminal.append(terminal) + episode_reward.append(reward) + agent.experience( + states=episode_states, internals=episode_internals, actions=episode_actions, + terminal=episode_terminal, reward=episode_reward + ) + agent.update() + + self.finished_test() + + def unittest( + self, environment=None, states=None, actions=None, num_episodes=None, + experience_update=None, **agent + ): + """ + Generic unit-test. + """ + agent, environment = self.prepare( + environment=environment, states=states, actions=actions, **agent + ) + + self.execute( + agent=agent, environment=environment, num_episodes=num_episodes, + experience_update=experience_update + ) diff --git a/test/unittest_environment.py b/test/unittest_environment.py new file mode 100644 index 000000000..e9bb9773d --- /dev/null +++ b/test/unittest_environment.py @@ -0,0 +1,235 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from collections import OrderedDict +from random import random + +import numpy as np + +from tensorforce import Environment, TensorforceError, util + + +class UnittestEnvironment(Environment): + """ + Unit-test mock environment. + + Args: + states: States specification. + actions: Actions specification. 
+ min_timesteps: Minimum number of timesteps. + """ + + def __init__(self, states, actions, min_timesteps): + super().__init__() + + self.states_spec = OrderedDict((name, states[name]) for name in sorted(states)) + self.actions_spec = OrderedDict((name, actions[name]) for name in sorted(actions)) + self.min_timesteps = min_timesteps + + self.random_states = self.__class__.random_states_function( + states_spec=self.states_spec, actions_spec=self.actions_spec + ) + self.is_valid_actions = self.__class__.is_valid_actions_function( + actions_spec=self.actions_spec + ) + + def states(self): + return self.states_spec + + def actions(self): + return self.actions_spec + + @classmethod + def random_states_function(cls, states_spec, actions_spec=None): + if actions_spec is None: + if 'shape' in states_spec: + return (lambda: cls.random_state_function(state_spec=states_spec)()) + else: + return (lambda: { + name: cls.random_state_function(state_spec=state_spec)() + for name, state_spec in states_spec.items() + }) + + elif 'shape' in states_spec: + if 'type' in actions_spec: + + def fn(): + random_states = cls.random_state_function(state_spec=states_spec)() + if actions_spec['type'] == 'int': + if not isinstance(random_states, dict): + random_states = dict(state=random_states) + mask = cls.random_mask(action_spec=actions_spec) + random_states['action_mask'] = mask + return random_states + + else: + + def fn(): + random_states = cls.random_state_function(state_spec=states_spec)() + for name, action_spec in actions_spec.items(): + if action_spec['type'] == 'int': + if not isinstance(random_states, dict): + random_states = dict(state=random_states) + mask = cls.random_mask(action_spec=action_spec) + random_states[name + '_mask'] = mask + return random_states + + else: + if 'type' in actions_spec: + + def fn(): + random_states = { + name: cls.random_state_function(state_spec=state_spec)() + for name, state_spec in states_spec.items() + } + if actions_spec['type'] == 'int': + mask = cls.random_mask(action_spec=actions_spec) + random_states['action_mask'] = mask + return random_states + + else: + + def fn(): + random_states = { + name: cls.random_state_function(state_spec=state_spec)() + for name, state_spec in states_spec.items() + } + for name, action_spec in actions_spec.items(): + if action_spec['type'] == 'int': + mask = cls.random_mask(action_spec=action_spec) + random_states[name + '_mask'] = mask + return random_states + + return fn + + @classmethod + def random_state_function(cls, state_spec): + shape = state_spec.get('shape', ()) + dtype = state_spec.get('type', 'float') + + if dtype == 'bool': + return (lambda: np.random.random_sample(size=shape) >= 0.5) + + elif dtype == 'int': + num_values = state_spec['num_values'] + return (lambda: np.random.randint(low=0, high=num_values, size=shape)) + + elif dtype == 'float': + if 'min_value' in state_spec: + min_value = state_spec['min_value'] + max_value = state_spec['max_value'] + return (lambda: ( + min_value + (max_value - min_value) * np.random.random_sample(size=shape) + )) + + else: + return (lambda: np.random.standard_normal(size=shape)) + + @classmethod + def random_mask(cls, action_spec): + shape = action_spec.get('shape', ()) + (action_spec['num_values'],) + mask = np.random.random_sample(size=shape) + min_mask = np.amin(mask, -1, keepdims=True) + max_mask = np.amax(mask, -1, keepdims=True) + threshold = np.random.random_sample(size=shape) + mask = mask < min_mask + threshold * (max_mask - min_mask) + assert mask.any(-1).all() and not 
mask.all(-1).any() + return mask + + @classmethod + def is_valid_actions_function(cls, actions_spec): + if 'type' in actions_spec: + return (lambda actions, states: + cls.is_valid_action_function(action_spec=actions_spec)(actions, 'action', states) + ) + + else: + return (lambda actions, states: all( + cls.is_valid_action_function(action_spec=action_spec)( + action=actions[name], name=name, states=states + ) for name, action_spec in actions_spec.items() + )) + + @classmethod + def is_valid_action_function(cls, action_spec): + dtype = action_spec['type'] + shape = action_spec.get('shape', ()) + + if dtype == 'bool': + return (lambda action, name, states: ( + ( + isinstance(action, util.py_dtype('bool')) and shape == () + ) or ( + isinstance(action, np.ndarray) and + action.dtype == util.np_dtype('bool') and action.shape == shape + ) + )) + + elif dtype == 'int': + num_values = action_spec['num_values'] + return (lambda action, name, states: ( + ( + isinstance(action, util.py_dtype('int')) and shape == () and + 0 <= action and action < num_values and states[name + '_mask'][action] + ) or ( + isinstance(action, np.ndarray) and action.dtype == util.np_dtype('int') and + action.shape == shape and (0 <= action).all() and + (action < num_values).all() and np.take_along_axis( + states[name + '_mask'], indices=np.expand_dims(action, axis=-1), axis=-1 + ).all() + ) + )) + + elif dtype == 'float': + if 'min_value' in action_spec: + min_value = action_spec['min_value'] + max_value = action_spec['max_value'] + return (lambda action, name, states: ( + ( + isinstance(action, util.py_dtype('float')) and shape == () and + min_value <= action and action <= max_value + ) or ( + isinstance(action, np.ndarray) and + action.dtype == util.np_dtype('float') and action.shape == shape and + (min_value <= action).all() and (action <= max_value).all() + ) + )) + + else: + return (lambda action, name, states: ( + ( + isinstance(action, util.py_dtype('float')) and shape == () + ) or ( + isinstance(action, np.ndarray) and + action.dtype == util.np_dtype('float') and action.shape == shape + ) + )) + + def reset(self): + self.timestep = 0 + self._states = self.random_states() + return self._states + + def execute(self, actions): + if not self.is_valid_actions(actions, self._states): + print(actions, self._states, self.actions_spec) + raise TensorforceError.value(name='execute', argument='actions', value=actions) + + self.timestep += 1 + self._states = self.random_states() + terminal = (self.timestep >= self.min_timesteps and random() < 0.25) + reward = -1.0 + 2.0 * random() + + return self._states, terminal, reward diff --git a/tune.py b/tune.py new file mode 100644 index 000000000..f8c0df62a --- /dev/null +++ b/tune.py @@ -0,0 +1,356 @@ +# Copyright 2020 Tensorforce Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import argparse +import importlib +import math +import os +import pickle + +import ConfigSpace as cs +from hpbandster.core.nameserver import NameServer, nic_name_to_host +from hpbandster.core.result import json_result_logger, logged_results_to_HBS_result +from hpbandster.core.worker import Worker +from hpbandster.optimizers import BOHB +import numpy as np + +from tensorforce import Runner, util + + +class TensorforceWorker(Worker): + + def __init__( + self, *args, environment, num_episodes, base, runs_per_round, max_episode_timesteps=None, + num_parallel=None, **kwargs + ): + super().__init__(*args, **kwargs) + self.environment = environment + self.max_episode_timesteps = max_episode_timesteps + self.num_episodes = num_episodes + self.base = base + self.runs_per_round = runs_per_round + self.num_parallel = num_parallel + + def compute(self, config_id, config, budget, working_directory): + budget = math.log(budget, self.base) + assert abs(budget - round(budget)) < util.epsilon + budget = round(budget) + assert budget < len(self.runs_per_round) + num_runs = self.runs_per_round[budget] + + update = dict(unit='episodes', batch_size=config['batch_size'], frequency=1) + policy = dict(network=dict(type='auto', size=64, depth=2, rnn=False)) + optimizer = dict( + optimizer='adam', learning_rate=config['learning_rate'], + multi_step=config['multi_step'], linesearch_iterations=5 # , subsampling_fraction=256 + ) + + if config['clipping_value'] > 1.0: + objective = dict( + type='policy_gradient', + importance_sampling=(config['importance_sampling'] == 'yes') + ) + else: + objective = dict( + type='policy_gradient', + importance_sampling=(config['importance_sampling'] == 'yes'), + clipping_value=config['clipping_value'] + ) + + if config['baseline'] == 'no': + predict_horizon_values = False + estimate_advantage = False + predict_action_values = False + baseline = None + baseline_optimizer = None + baseline_objective = None + + elif config['baseline'] == 'same': + predict_horizon_values = 'early' + estimate_advantage = (config['estimate_advantage'] == 'yes') + predict_action_values = False + baseline = None + baseline_optimizer = config['baseline_weight'] + baseline_objective = dict(type='value', value='state') + + elif config['baseline'] == 'yes': + predict_horizon_values = 'early' + estimate_advantage = (config['estimate_advantage'] == 'yes') + predict_action_values = False + baseline = dict(network=dict(type='auto', size=64, depth=2, rnn=False)) + baseline_optimizer = config['baseline_weight'] + baseline_objective = dict(type='value', value='state') + + else: + assert False + + reward_estimation = dict( + horizon=config['horizon'], discount=config['discount'], + predict_horizon_values=predict_horizon_values, estimate_advantage=estimate_advantage, + predict_action_values=predict_action_values + ) + + if config['entropy_regularization'] < 1e-5: + entropy_regularization = 0.0 + else: + entropy_regularization = config['entropy_regularization'] + + agent = dict( + policy=policy, memory='recent', update=update, optimizer=optimizer, objective=objective, + reward_estimation=reward_estimation, baseline=baseline, + baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective, + entropy_regularization=entropy_regularization + ) + + average_reward = list() + final_reward = list() + rewards = list() + + for n in range(num_runs): + if self.num_parallel is None: + runner = Runner( + agent=agent, 
environment=self.environment, + max_episode_timesteps=self.max_episode_timesteps + ) + runner.run(num_episodes=self.num_episodes, use_tqdm=False) + else: + runner = Runner( + agent=agent, environment=self.environment, + max_episode_timesteps=self.max_episode_timesteps, + num_parallel=min(self.num_parallel, config['batch_size']), + remote='multiprocessing' + ) + runner.run( + num_episodes=self.num_episodes, batch_agent_calls=True, sync_episodes=True, + use_tqdm=False + ) + runner.close() + + average_reward.append(float(np.mean(runner.episode_returns, axis=0))) + final_reward.append(float(np.mean(runner.episode_returns[-20:], axis=0))) + rewards.append(list(runner.episode_returns)) + + mean_average_reward = float(np.mean(average_reward, axis=0)) + mean_final_reward = float(np.mean(final_reward, axis=0)) + loss = -(mean_average_reward + mean_final_reward) + + return dict(loss=loss, info=dict(rewards=rewards)) + + @staticmethod + def get_configspace(): + configspace = cs.ConfigurationSpace() + + batch_size = cs.hyperparameters.UniformIntegerHyperparameter( + name='batch_size', lower=1, upper=20, log=True + ) + configspace.add_hyperparameter(hyperparameter=batch_size) + + learning_rate = cs.hyperparameters.UniformFloatHyperparameter( + name='learning_rate', lower=1e-5, upper=1e-1, log=True + ) + configspace.add_hyperparameter(hyperparameter=learning_rate) + + multi_step = cs.hyperparameters.UniformIntegerHyperparameter( + name='multi_step', lower=1, upper=20, log=True + ) + configspace.add_hyperparameter(hyperparameter=multi_step) + + horizon = cs.hyperparameters.UniformIntegerHyperparameter( + name='horizon', lower=1, upper=100, log=True + ) + configspace.add_hyperparameter(hyperparameter=horizon) + + discount = cs.hyperparameters.UniformFloatHyperparameter( + name='discount', lower=0.8, upper=1.0, log=True + ) + configspace.add_hyperparameter(hyperparameter=discount) + + importance_sampling = cs.hyperparameters.CategoricalHyperparameter( + name='importance_sampling', choices=('no', 'yes') + ) + configspace.add_hyperparameter(hyperparameter=importance_sampling) + + # > 1.0: off (ln(1.3) roughly 1/10 of ln(5e-2)) + clipping_value = cs.hyperparameters.UniformFloatHyperparameter( + name='clipping_value', lower=5e-2, upper=1.3, log=True + ) + configspace.add_hyperparameter(hyperparameter=clipping_value) + + baseline = cs.hyperparameters.CategoricalHyperparameter( + name='baseline', choices=('no', 'same', 'yes') + ) + configspace.add_hyperparameter(hyperparameter=baseline) + + baseline_weight = cs.hyperparameters.UniformFloatHyperparameter( + name='baseline_weight', lower=1e-2, upper=1e2 + ) + configspace.add_hyperparameter(hyperparameter=baseline_weight) + + estimate_advantage = cs.hyperparameters.CategoricalHyperparameter( + name='estimate_advantage', choices=('no', 'yes') + ) + configspace.add_hyperparameter(hyperparameter=estimate_advantage) + + # < 1e-5: off (ln(3e-6) roughly 1/10 of ln(1e-5)) + entropy_regularization = cs.hyperparameters.UniformFloatHyperparameter( + name='entropy_regularization', lower=3e-6, upper=1.0, log=True + ) + configspace.add_hyperparameter(hyperparameter=entropy_regularization) + + # configspace.add_condition(condition=cs.EqualsCondition( + # child=clipping_value, parent=importance_sampling, value='yes' + # )) + configspace.add_condition(condition=cs.NotEqualsCondition( + child=estimate_advantage, parent=baseline, value='no' + )) + configspace.add_condition(condition=cs.NotEqualsCondition( + child=baseline_weight, parent=baseline, value='no' + )) + + return 
configspace + + +def main(): + parser = argparse.ArgumentParser( + description='Tensorforce hyperparameter tuner, using BOHB optimizer (Bayesian Optimization ' + 'and Hyperband)' + ) + # Environment arguments (from run.py) + parser.add_argument( + '-e', '--environment', type=str, + help='Environment (name, configuration JSON file, or library module)' + ) + parser.add_argument( + '-l', '--level', type=str, default=None, + help='Level or game id, like `CartPole-v1`, if supported' + ) + parser.add_argument( + '-m', '--max-episode-timesteps', type=int, default=None, + help='Maximum number of timesteps per episode' + ) + parser.add_argument( + '--import-modules', type=str, default=None, + help='Import comma-separated modules required for environment' + ) + # Runner arguments (from run.py) + parser.add_argument('-n', '--episodes', type=int, help='Number of episodes') + parser.add_argument( + '-p', '--num-parallel', type=int, default=None, + help='Number of environment instances to execute in parallel' + ) + # Tuner arguments + parser.add_argument( + '-r', '--runs-per-round', type=str, default='1,2,5,10', + help='Comma-separated number of runs per optimization round, each with a successively ' + 'smaller number of candidates' + ) + parser.add_argument( + '-s', '--selection-factor', type=int, default=3, + help='Selection factor n, meaning that one out of n candidates in each round advances to ' + 'the next optimization round' + ) + parser.add_argument( + '-i', '--num-iterations', type=int, default=1, + help='Number of optimization iterations, each consisting of a series of optimization ' + 'rounds with an increasingly reduced candidate pool' + ) + parser.add_argument( + '-d', '--directory', type=str, default='tuner', help='Output directory' + ) + parser.add_argument( + '--restore', type=str, default=None, help='Restore from given directory' + ) + parser.add_argument('--id', type=str, default='worker', help='Unique worker id') + args = parser.parse_args() + + if args.import_modules is not None: + for module in args.import_modules.split(','): + importlib.import_module(name=module) + + environment = dict(environment=args.environment) + if args.level is not None: + environment['level'] = args.level + + if False: + host = nic_name_to_host(nic_name=None) + port = 123 + else: + host = 'localhost' + port = None + + runs_per_round = tuple(int(x) for x in args.runs_per_round.split(',')) + print('Bayesian Optimization and Hyperband optimization') + print(f'{args.num_iterations} iterations of each {len(runs_per_round)} rounds:') + for n, num_runs in enumerate(runs_per_round, start=1): + num_candidates = round(math.pow(args.selection_factor, len(runs_per_round) - n)) + print(f'round {n}: {num_candidates} candidates, each {num_runs} runs') + print() + + server = NameServer(run_id=args.id, working_directory=args.directory, host=host, port=port) + nameserver, nameserver_port = server.start() + + worker = TensorforceWorker( + environment=environment, max_episode_timesteps=args.max_episode_timesteps, + num_episodes=args.episodes, base=args.selection_factor, runs_per_round=runs_per_round, + num_parallel=args.num_parallel, run_id=args.id, nameserver=nameserver, + nameserver_port=nameserver_port, host=host + ) + worker.run(background=True) + + if args.restore is None: + previous_result = None + else: + previous_result = logged_results_to_HBS_result(directory=args.restore) + + result_logger = json_result_logger(directory=args.directory, overwrite=True) + + optimizer = BOHB( + configspace=worker.get_configspace(), 
eta=args.selection_factor, min_budget=0.9, + max_budget=math.pow(args.selection_factor, len(runs_per_round) - 1), run_id=args.id, + working_directory=args.directory, nameserver=nameserver, nameserver_port=nameserver_port, + host=host, result_logger=result_logger, previous_result=previous_result + ) + # BOHB(configspace=None, eta=3, min_budget=0.01, max_budget=1, min_points_in_model=None, + # top_n_percent=15, num_samples=64, random_fraction=1 / 3, bandwidth_factor=3, + # min_bandwidth=1e-3, **kwargs) + # Master(run_id, config_generator, working_directory='.', ping_interval=60, + # nameserver='127.0.0.1', nameserver_port=None, host=None, shutdown_workers=True, + # job_queue_sizes=(-1,0), dynamic_queue_size=True, logger=None, result_logger=None, + # previous_result = None) + # logger: logging.logger like object, the logger to output some (more or less meaningful) + # information + + results = optimizer.run(n_iterations=args.num_iterations) + # optimizer.run(n_iterations=1, min_n_workers=1, iteration_kwargs={}) + # min_n_workers: int, minimum number of workers before starting the run + + optimizer.shutdown(shutdown_workers=True) + server.shutdown() + + with open(os.path.join(args.directory, 'results.pkl'), 'wb') as filehandle: + pickle.dump(results, filehandle) + + print('Best found configuration: {}'.format( + results.get_id2config_mapping()[results.get_incumbent_id()]['config'] + )) + print('Runs:', results.get_runs_by_id(config_id=results.get_incumbent_id())) + print('A total of {} unique configurations were sampled.'.format( + len(results.get_id2config_mapping()) + )) + print('A total of {} runs were executed.'.format(len(results.get_all_runs()))) + + +if __name__ == '__main__': + main()
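+
+# Example invocation (assuming a Gym CartPole-v1 environment is available; flags as defined by the
+# argparse options above):
+#     python tune.py -e CartPole-v1 -n 300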