Ensemble Investment Strategies Based on Reinforcement Learning

<table class="table-group" id="tab1"><tr><td><table class="table"><tr><td class="thead-hr" colspan="1"><hr/></td></tr><tr><td class="align_left">Input: environment of the stock market</td></tr><tr><td class="align_left">Output: estimated optimal strategy <svg height="11.5564pt" id="M33" style="vertical-align:-2.26807pt" version="1.1" viewbox="-0.0498162 -9.28833 23.3394 11.5564" width="23.3394pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M574 449L545 460C526 432 516 430 487 430C404 430 311 435 226 435C104 435 56 379 25 341L43 318C81 354 121 372 181 372C161 246 87 53 23 3L30 -12C48 -12 88 -4 113 11C157 75 207 248 232 371L386 367L326 109C321 86 318 66 318 50C318 4 339 -12 366 -12C410 -12 461 21 505 69L492 96C467 79 434 60 418 60C406 60 400 78 411 147C422 217 439 300 457 366C487 366 524 367 536 370C547 385 558 408 574 449Z"></path></g><g transform="matrix(.013,0,0,-0.013,7.684,0)"><path d="M300 -147C201 -63 143 98 143 270S200 602 300 686L282 710C136 610 70 450 70 271V270C70 89 136 -72 282 -170L300 -147Z"></path></g><g transform="matrix(.013,0,0,-0.013,12.182,0)"><path d="M475 507C475 612 440 712 326 712C139 712 23 420 23 215C23 96 58 -12 180 -12C369 -12 475 293 475 507ZM391 522C391 486 387 448 379 394H126C155 538 222 677 310 677C386 677 391 571 391 522ZM373 346C344 193 283 22 189 22C126 22 106 114 106 196C106 243 111 293 118 346H373Z"></path></g><g transform="matrix(.013,0,0,-0.013,18.657,0)"><path d="M275 270C275 450 212 609 64 710L45 686C145 604 203 442 203 270S147 -63 45 -147L64 -170C213 -68 275 89 275 270Z"></path></g></svg></td></tr><tr><td class="align_left">Initial setup of actor and critic networks</td></tr><tr><td class="align_left"><b>Repeat</b></td></tr><tr><td class="align_left"> <b>For</b> episodes = 0, 1, 2, …, N <b>do:</b></td></tr><tr><td class="align_left">  Get state <svg height="8.8423pt" id="M34" style="vertical-align:-0.2064009pt" version="1.1" 
viewbox="-0.0498162 -8.6359 6.25863 8.8423" width="6.25863pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M449 634C442 637 425 643 405 650C376 660 341 666 307 666C181 666 98 590 98 485C98 400 170 343 215 310L246 288C307 243 343 204 343 147C343 67 291 18 219 18C104 18 61 124 51 202L23 199C28 124 27 71 27 47C47 22 122 -16 204 -16C324 -16 428 60 428 174C428 256 379 309 307 360L276 382C223 419 179 455 179 516C179 576 221 632 293 632C379 632 410 564 418 487L448 490C446 536 446 592 449 634Z"></path></g></svg> and calculate <svg height="11.5564pt" id="M35" style="vertical-align:-2.26807pt" version="1.1" viewbox="-0.0498162 -9.28833 47.2577 11.5564" width="47.2577pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M574 449L545 460C526 432 516 430 487 430C404 430 311 435 226 435C104 435 56 379 25 341L43 318C81 354 121 372 181 372C161 246 87 53 23 3L30 -12C48 -12 88 -4 113 11C157 75 207 248 232 371L386 367L326 109C321 86 318 66 318 50C318 4 339 -12 366 -12C410 -12 461 21 505 69L492 96C467 79 434 60 418 60C406 60 400 78 411 147C422 217 439 300 457 366C487 366 524 367 536 370C547 385 558 408 574 449Z"></path></g><g transform="matrix(.013,0,0,-0.013,7.684,0)"><path d="M300 -147C201 -63 143 98 143 270S200 602 300 686L282 710C136 610 70 450 70 271V270C70 89 136 -72 282 -170L300 -147Z"></path></g><g transform="matrix(.013,0,0,-0.013,12.182,0)"><path d="M686 28C612 35 607 44 591 112C563 234 541 360 519 489L489 666L457 658L147 121C100 40 89 36 24 28L17 0H240L250 28C168 34 159 41 190 101L262 237H482C495 180 503 137 510 91C517 47 514 35 441 28L433 0H677L686 28ZM475 280H285L429 541H431L475 280Z"></path></g><g transform="matrix(.013,0,0,-0.013,21.317,0)"><path d="M449 634C442 637 425 643 405 650C376 660 341 666 307 666C181 666 98 590 98 485C98 400 170 343 215 310L246 288C307 243 343 204 343 147C343 67 291 18 219 18C104 18 
61 124 51 202L23 199C28 124 27 71 27 47C47 22 122 -16 204 -16C324 -16 428 60 428 174C428 256 379 309 307 360L276 382C223 419 179 455 179 516C179 576 221 632 293 632C379 632 410 564 418 487L448 490C446 536 446 592 449 634Z"></path></g><g transform="matrix(.013,0,0,-0.013,27.453,0)"><path d="M114 412C81 412 58 388 58 355C58 321 81 297 113 297S169 321 169 355C169 388 145 412 114 412ZM95 130C70 130 46 114 46 88C46 72 54 65 59 64C93 56 121 33 121 -3C121 -41 93 -68 45 -88L56 -118C117 -99 186 -56 186 22C186 91 131 130 95 130Z"></path></g><g transform="matrix(.013,0,0,-0.013,32.596,0)"><path d="M475 507C475 612 440 712 326 712C139 712 23 420 23 215C23 96 58 -12 180 -12C369 -12 475 293 475 507ZM391 522C391 486 387 448 379 394H126C155 538 222 677 310 677C386 677 391 571 391 522ZM373 346C344 193 283 22 189 22C126 22 106 114 106 196C106 243 111 293 118 346H373Z"></path></g><g transform="matrix(.013,0,0,-0.013,39.072,0)"><path d="M162 -163V703H101V-163H162Z"></path></g><g transform="matrix(.013,0,0,-0.013,42.491,0)"><path d="M275 270C275 450 212 609 64 710L45 686C145 604 203 442 203 270S147 -63 45 -147L64 -170C213 -68 275 89 275 270Z"></path></g></svg> to get action <svg height="8.68572pt" id="M36" style="vertical-align:-0.0498209pt" version="1.1" viewbox="-0.0498162 -8.6359 9.2729 8.68572" width="9.2729pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M686 28C612 35 607 44 591 112C563 234 541 360 519 489L489 666L457 658L147 121C100 40 89 36 24 28L17 0H240L250 28C168 34 159 41 190 101L262 237H482C495 180 503 137 510 91C517 47 514 35 441 28L433 0H677L686 28ZM475 280H285L429 541H431L475 280Z"></path></g></svg></td></tr><tr><td class="align_left">  <b>If</b> the episode has not ended:</td></tr><tr><td class="align_left">   Get <svg height="11.6425pt" id="M37" style="vertical-align:-0.2063999pt" version="1.1" viewbox="-0.0498162 -11.4361 10.0542 11.6425" width="10.0542pt" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M449 634C442 637 425 643 405 650C376 660 341 666 307 666C181 666 98 590 98 485C98 400 170 343 215 310L246 288C307 243 343 204 343 147C343 67 291 18 219 18C104 18 61 124 51 202L23 199C28 124 27 71 27 47C47 22 122 -16 204 -16C324 -16 428 60 428 174C428 256 379 309 307 360L276 382C223 419 179 455 179 516C179 576 221 632 293 632C379 632 410 564 418 487L448 490C446 536 446 592 449 634Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,6.136,-5.741)"><path d="M310 541L304 571C290 586 211 619 185 610L80 76L131 52L310 541Z"></path></g></svg> with reward <svg height="6.1673pt" id="M38" style="vertical-align:-0.2063904pt" version="1.1" viewbox="-0.0498162 -5.96091 6.40217 6.1673" width="6.40217pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M471 456L444 459C426 433 414 430 388 430C324 430 270 434 216 434C103 434 51 374 23 338L43 317C96 366 146 380 221 375L154 109C149 86 147 68 147 52C147 4 168 -12 197 -12C240 -12 291 25 334 71L320 96C295 75 268 58 252 58C238 58 227 79 238 138C251 211 272 296 292 372C310 372 332 368 350 368C391 368 421 369 434 371C444 388 455 413 471 456Z"></path></g></svg></td></tr><tr><td class="align_left">   Use the critic network to obtain return values to estimate <i>Q</i></td></tr><tr><td class="align_left">   Calculate the gradient using the <i>Q</i> values and update the actor network</td></tr><tr><td class="align_left">   Update the critic network to reduce the difference</td></tr><tr><td class="align_left">   Update state <svg height="8.8423pt" id="M39" style="vertical-align:-0.2064009pt" version="1.1" viewbox="-0.0498162 -8.6359 6.25863 8.8423" width="6.25863pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M449 634C442 637 425 643 405 650C376 660 341 666 
307 666C181 666 98 590 98 485C98 400 170 343 215 310L246 288C307 243 343 204 343 147C343 67 291 18 219 18C104 18 61 124 51 202L23 199C28 124 27 71 27 47C47 22 122 -16 204 -16C324 -16 428 60 428 174C428 256 379 309 307 360L276 382C223 419 179 455 179 516C179 576 221 632 293 632C379 632 410 564 418 487L448 490C446 536 446 592 449 634Z"></path></g></svg></td></tr><tr><td class="align_left">  <b>End</b></td></tr><tr><td class="align_left"> <b>End</b></td></tr><tr><td class="align_left"><b>Until convergence</b></td></tr><tr class="table-tr"><td colspan="1"><hr class="tbody-hr"/></td></tr></table></td></tr></table>

<div>Pseudo-code for the A2C model.</div>

Scientific Programming

tab1

Table 1

Table 1: Ensemble Investment Strategies Based on Reinforcement Learning