% ====================================================================== % This function implements the value iteration algorithm for the brick. % It returns the optimal value function J and policy PI. % - Rick % % You need to implement: % 1) lqr regulator task w/ discrete actions % 3) oscillating brick task % 2) lqr regulator task w/ continuous actions % ====================================================================== function [J,PI] = brick_vi global dt; % define the mesh points as a row vector (YOU FILL THIS IN) q_bins = [ ]; qdot_bins = []; if (isempty(q_bins) || isempty(qdot_bins)) error('you need to define the mesh'); end % The discrete actions, defined as a row vector (FOR PARTS a-c) % NOTE: PART d REQUIRES THE CONTINUOUS ACTION IMPLEMENTATION a = []; if (isempty(a)) error('you need to define the action set'); end % dynamics dt dt = 1e-2; % create the mesh [q qdot] = ndgrid(q_bins,qdot_bins); s = [reshape(q,1,numel(q)); reshape(qdot,1,numel(qdot))]; ns = size(s,2); na = size(a,2); % generate all possible state and action pairs S = repmat(s,1,na); % repeat s na times A = reshape(repmat(a,ns,1),1,ns*na); % repeat a ns times % compute the one-step dynamics Sn = S + dynamics(S,A).*dt; % Compute the transition matrix disp('Computing Transition Matrix...'); [Pi,P] = volumetric_interp(s,Sn,q_bins,qdot_bins); % Compute the one-step cost C = reshape(cost(S,A),ns,na); % Setup value iteration (you shouldn't need to change this) J = zeros(ns,1); % arbitrary initialization converged = 0.1; % value converged threshold gamma = 0.99; % discount factor iter = 1; err = 1e6; % Iterate the value estimate % USE THE VALUE ITERATION UPDATE GIVEN HERE FOR PARTS a-c. FOR PART d, YOU % WILL NEED TO MODIFY THIS UPDATE TO USE CONTINUOUS ACTIONS. while (err > converged) [Jnew, PI] = min(C + gamma*reshape(sum(P.*J(Pi),1),ns,na),[],2); err = max(abs(Jnew-J)); disp(['iteration = ',num2str(iter),' ; max_err = ',num2str(err)]); J = Jnew; iter = iter+1; vi_plot(J,a(PI),q_bins,qdot_bins); end % === HINT ============================================================ % For implementing cont. actions, you need to do the following: % 1) compute the gradients of the J wrt x using the function 'get_grad' % 2) get the R matrix using 'get_QR' (remember to divide the discrete R by dt) % 3) compute the optimal action using the gradients of J % 4) compute Jnew by computing the one-step-cost and the cost-to-go for the % next states % ===================================================================== disp('Value Estimate converged!'); subplot(2,1,1);xlabel('q'); ylabel('q_{dot}'); title('u'); colorbar; subplot(2,1,2);xlabel('q'); ylabel('q_{dot}'); title('Value function'); colorbar; disp('Press Enter to simulate...');pause; % simulate a trajectory PI = a(PI)'; T = 20; disp_dts = 5; xtraj = zeros(2,T/dt); x = [-3 0]'; for i=1:T/dt xtraj(:,i) = x; % FOR PART d, MAKE USE OF CONTINOUS ACTIONS INSTEAD % OF THE INTERPOLATED DISCRETE ACTIONS GIVEN HERE. [ind,coef] = volumetric_interp(s,x,q_bins,qdot_bins); u = sum(coef.*PI(ind),1); % === HINT ============================================================ % For implementing cont. actions, you need to follow a similar procedure % to the one for computing Jnew above % ===================================================================== if (mod(i,disp_dts)==0) draw((i-1)*dt,x); end x = x + dynamics(x,u).*dt; end end % end of brick_vi % ========================================================== % This function performs volumetric interpolation on the % state(s) Sn, returning the box indices(Pi) and weights(P) % (a.k.a. transition probabilities) % The input is: % s: the mesh % Sn: the states to interpolate for % [q-bins, qdot_bins]: the bins used to create the mesh % ========================================================== function [Pi,P] = volumetric_interp(s,Sn,q_bins,qdot_bins) ns = size(Sn,2); Pi = zeros(4,ns); P = Pi; % impose limits and wrapping on the state Sn = normalize(Sn,q_bins,qdot_bins); % compute each transition individualy for i=1:ns if (mod(i,4e3)==0) disp([num2str((i/ns)*1e2),'% done']); end % lower left corner ind_q = max([find(q_bins <= Sn(1,i),1,'last') 1]); ind_qdot = max([find(qdot_bins <= Sn(2,i),1,'last') 1]); offset = [0 0;1 0;0 1;1 1]; if (ind_q == length(q_bins)) offset(:,1) = -offset(:,1); end if (ind_qdot == length(qdot_bins)) offset(:,2) = -offset(:,2); end % compute the total area of the containing box totl_area = abs(q_bins(ind_q+offset(2,1))-q_bins(ind_q))*... abs(qdot_bins(ind_qdot+offset(3,2))-qdot_bins(ind_qdot)); % compute the four corner indices and weights for j=1:4 state = [q_bins(ind_q+offset(j,1)); qdot_bins(ind_qdot+offset(j,2))]; Pi(j,i) = find(sum(abs(s-repmat(state,1,size(s,2))),1)==0); P(5-j,i) = (abs(Sn(1,i)-q_bins(ind_q+offset(j,1)))*abs(Sn(2,i)-qdot_bins(ind_qdot+offset(j,2))))/totl_area; end end end % ============================================================= % This function returns the gradients of the value function % J (a column vector) for a given state x = [q qdot]'. % The inputs are: % J: the value function % x: the state you want the gradients for % s: the mesh over the state space % [q_bins,qdot_bins]: the bins used to create the mesh for s % ============================================================= function dJdx = get_grad(J,x,s,q_bins,qdot_bins) nx = size(x,2); dJdx = zeros(size(x)); % impose limits and wrapping on the state x = normalize(x,q_bins,qdot_bins); % compute each gradient individualy % (a vectorized form would be more efficient) for i=1:nx if (mod(i,4e3)==0) disp([num2str((i/nx)*1e2),'% done']); end [ind,coef] = volumetric_interp(s,x(:,i),q_bins,qdot_bins); Jcor = J(ind); % the value function at the corners q = s(1,ind); qdot = s(2,ind); % the state at the corners dJdq = (Jcor(2)-Jcor(1))./(q(2)-q(1)); dJdqdot = (Jcor(3)-Jcor(1))./(qdot(3)-qdot(1)); dJdx(:,i) = [dJdq; dJdqdot]; end end % ============================================================= % This function defines the continuous dynamics of the brick % ============================================================= function xdot = dynamics(x,u) xdot = [x(2,:); u]; end % =================================================================== % This function defines the instantaneous cost (i.e. g(x,u)) % YOU SHOULD FILL THIS IN. % =================================================================== function C = cost(X,u) C = 0; error('you need to define the cost function'); % remove this line once you've done it end % ========================================================= % This just makes the cost matrices accessible to the rest % of the script (e.g. you need R for the cont. action % implementation). % ========================================================= function [Q,R] = get_QR global dt; % DEFINE THESE COST MATRICES CORRECTLY. Q = diag([0 0]).*dt; % <== the dt just discretizes R = 0*dt; error('you need to define the cost matrices'); % remove this when you've done it end % ============================================================== % This function imposes limits on the state % ============================================================== function s = normalize(s,q_bins,qdot_bins) % impose limits N = size(s,2); smax = repmat([q_bins(end);qdot_bins(end)],1,N); ind = s>smax; s(ind) = smax(ind); smin = repmat([q_bins(1);qdot_bins(1)],1,N); ind = s